From 0a81791f7a6a771c5602aecf5898280543b87590 Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Mon, 8 Sep 2025 11:53:55 -0400 Subject: [PATCH 01/16] Added optimized ppc64le support functions for ML-KEM. The supported native functions include: 1. MLK_USE_NATIVE_NTT (ntt_ppc.S) 2. MLK_USE_NATIVE_INTT (intt_ppc.S) 3. MLK_USE_NATIVE_POLY_REDUCE (reduce.S) 4. MLK_USE_NATIVE_POLY_TOMONT (poly_tomont.S) And other interface functions and headers. Signed-off-by: Danny Tsen --- dev/ppc64le/README.md | 6 + dev/ppc64le/meta.h | 49 ++ dev/ppc64le/src/arith_native_ppc64le.h | 23 + dev/ppc64le/src/intt_ppc.S | 773 ++++++++++++++++++ dev/ppc64le/src/ntt_ppc.S | 498 +++++++++++ dev/ppc64le/src/poly_tomont.S | 163 ++++ dev/ppc64le/src/reduce.S | 225 +++++ integration/liboqs/ML-KEM-1024_META.yml | 141 ++-- integration/liboqs/ML-KEM-512_META.yml | 141 ++-- integration/liboqs/ML-KEM-768_META.yml | 141 ++-- integration/liboqs/config_ppc64le.h | 266 ++++++ mlkem/src/native/meta.h | 4 + mlkem/src/native/ppc64le/README.md | 6 + mlkem/src/native/ppc64le/meta.h | 49 ++ .../native/ppc64le/src/arith_native_ppc64le.h | 23 + mlkem/src/native/ppc64le/src/intt_ppc.S | 773 ++++++++++++++++++ mlkem/src/native/ppc64le/src/ntt_ppc.S | 498 +++++++++++ mlkem/src/native/ppc64le/src/poly_tomont.S | 163 ++++ mlkem/src/native/ppc64le/src/reduce.S | 225 +++++ test/mk/auto.mk | 132 +-- test/mk/components.mk | 10 +- 21 files changed, 3970 insertions(+), 339 deletions(-) create mode 100644 dev/ppc64le/README.md create mode 100644 dev/ppc64le/meta.h create mode 100644 dev/ppc64le/src/arith_native_ppc64le.h create mode 100644 dev/ppc64le/src/intt_ppc.S create mode 100644 dev/ppc64le/src/ntt_ppc.S create mode 100644 dev/ppc64le/src/poly_tomont.S create mode 100644 dev/ppc64le/src/reduce.S create mode 100644 integration/liboqs/config_ppc64le.h create mode 100644 mlkem/src/native/ppc64le/README.md create mode 100644 mlkem/src/native/ppc64le/meta.h create mode 100644 mlkem/src/native/ppc64le/src/arith_native_ppc64le.h 
create mode 100644 mlkem/src/native/ppc64le/src/intt_ppc.S create mode 100644 mlkem/src/native/ppc64le/src/ntt_ppc.S create mode 100644 mlkem/src/native/ppc64le/src/poly_tomont.S create mode 100644 mlkem/src/native/ppc64le/src/reduce.S diff --git a/dev/ppc64le/README.md b/dev/ppc64le/README.md new file mode 100644 index 000000000..5125a40ea --- /dev/null +++ b/dev/ppc64le/README.md @@ -0,0 +1,6 @@ +[//]: # (SPDX-License-Identifier: CC-BY-4.0) + +# ppc64le backend (little endian) + +This directory contains a native backend for little endian POWER 8 (ppc64le) and above systems. + diff --git a/dev/ppc64le/meta.h b/dev/ppc64le/meta.h new file mode 100644 index 000000000..bee788976 --- /dev/null +++ b/dev/ppc64le/meta.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLK_NATIVE_PPC64LE_META_H +#define MLK_NATIVE_PPC64LE_META_H + +/* Identifier for this backend so that source and assembly files + * in the build can be appropriately guarded. 
*/ +#define MLK_ARITH_BACKEND_PPC64LE_DEFAULT + +#define MLK_ARITH_BACKEND_NAME PPC64LE_DEFAULT + +/* Set of primitives that this backend replaces */ +#define MLK_USE_NATIVE_NTT +#define MLK_USE_NATIVE_INTT +#define MLK_USE_NATIVE_POLY_REDUCE +#define MLK_USE_NATIVE_POLY_TOMONT + +#if !defined(__ASSEMBLER__) /* C-only part: this header is also included from assembly sources */ +#include <stdint.h> /* int16_t */ +#include "../../common.h" +#include "../../params.h" +#include "../api.h" +#include "src/arith_native_ppc64le.h" /* prototypes of the assembly routines wrapped below */ + +static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) { /* forwards to mlk_ntt_ppc; always reports success */ + mlk_ntt_ppc(data); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) { /* forwards to mlk_intt_ppc; always reports success */ + mlk_intt_ppc(data); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) { /* forwards to mlk_reduce_ppc; always reports success */ + mlk_reduce_ppc(data); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) { /* forwards to mlk_poly_tomont_ppc; always reports success */ + mlk_poly_tomont_ppc(data); + return MLK_NATIVE_FUNC_SUCCESS; +} +#endif /* !__ASSEMBLER__ */ + +#endif /* MLK_NATIVE_PPC64LE_META_H */ diff --git a/dev/ppc64le/src/arith_native_ppc64le.h b/dev/ppc64le/src/arith_native_ppc64le.h new file mode 100644 index 000000000..57f0b8f8c --- /dev/null +++ b/dev/ppc64le/src/arith_native_ppc64le.h @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2024-2025 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#define MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H + +#include <stdint.h> /* int16_t */ +#include "../../../common.h" + +#define mlk_ntt_ppc MLK_NAMESPACE(ntt_ppc) +void mlk_ntt_ppc(int16_t *r); /* forward NTT, loads from and stores to r; implemented in ntt_ppc.S */ + +#define mlk_intt_ppc MLK_NAMESPACE(intt_ppc) +void mlk_intt_ppc(int16_t *r); /* inverse NTT, loads from and stores to r; implemented in intt_ppc.S */ + +#define mlk_reduce_ppc MLK_NAMESPACE(reduce_ppc) +void mlk_reduce_ppc(int16_t *r); /* implemented in reduce.S */ + +#define mlk_poly_tomont_ppc MLK_NAMESPACE(poly_tomont_ppc) +void mlk_poly_tomont_ppc(int16_t *r); /* implemented in poly_tomont.S */ + +#endif /* MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H */ diff --git a/dev/ppc64le/src/intt_ppc.S b/dev/ppc64le/src/intt_ppc.S new file mode 100644
index 000000000..feb78b984 --- /dev/null +++ b/dev/ppc64le/src/intt_ppc.S @@ -0,0 +1,773 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. +# +#=================================================================================== +# Written by Danny Tsen +# + +#include "../../../common.h" + +.machine "any" +.text + +# Barrett reduce constants (vector register numbers) +#define V20159 0 +#define V_25 1 +#define V_26 2 +#define V_MKQ 3 + +# Montgomery reduce constants (vector register numbers; V_Z3, V_ZETA and V1441 share v10) +#define V_QINV 2 +#define V_NMKQ 5 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 +#define V_ZETA 10 +#define V1441 10 + +.macro Load_4Coeffs start next step + mr 9, \start # j: byte offset of the first vector + add 10, 4, 9 # j + len*2 (r4 holds len*2, i.e. len in bytes) + addi 16, 9, \next + addi 17, 10, \step + addi 18, 16, \next + addi 19, 17, \step + addi 20, 18, \next + addi 21, 19, \step + lxvd2x 32+8, 3, 10 # r[j+len] + lxvd2x 32+12, 3, 17 # r[j+len] + lxvd2x 32+16, 3, 19 # r[j+len] + lxvd2x 32+20, 3, 21 # r[j+len] + xxpermdi 32+8, 32+8, 32+8, 2 + xxpermdi 32+12, 32+12, 32+12, 2 + xxpermdi 32+16, 32+16, 32+16, 2 + xxpermdi 32+20, 32+20, 32+20, 2 + + lxvd2x 32+21, 3, 9 # t = r[j] + lxvd2x 32+22, 3, 16 # t = r[j] + lxvd2x 32+23, 3, 18 # t = r[j] + lxvd2x 32+24, 3, 20 # t = r[j] + xxpermdi 32+21, 32+21, 32+21, 2 + xxpermdi 32+22, 32+22, 32+22, 2 + xxpermdi 32+23, 32+23, 32+23, 2 + xxpermdi 32+24, 32+24, 32+24, 2 + + vsubuhm 25, 8, 21 # r[j+len] - t + vsubuhm 26, 12, 22 # r[j+len] - t + vsubuhm 30, 16, 23 # r[j+len] - t + vsubuhm 31, 20, 24 # r[j+len] - t + vadduhm 8, 8, 21 # r[j+len] + t + vadduhm 12, 12, 22 # r[j+len] + t + vadduhm 16, 16, 23 # r[j+len] + t + vadduhm 20, 20, 24 # r[j+len] + t +.endm + +.macro BREDUCE_4X _v0 _v1 _v2 _v3 + vxor 7, 7, 7 + xxlor 32+3, 6, 6 # V_MKQ + xxlor 32+1, 7, 7 # V_25 + xxlor 32+2, 8, 8 # V_26 + vmulosh 6, 8, V20159 + vmulesh 5, 8, V20159 + vmulosh 11, 12, V20159 + vmulesh 10, 12, V20159 + vmulosh 15, 16, V20159 + vmulesh 14, 16, V20159 + vmulosh 19, 20, V20159 + vmulesh 18, 20,
V20159 + xxmrglw 32+4, 32+5, 32+6 + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V_25 + vadduwm 5, 5, V_25 + vadduwm 9, 9, V_25 + vadduwm 10, 10, V_25 + vadduwm 13, 13, V_25 + vadduwm 14, 14, V_25 + vadduwm 17, 17, V_25 + vadduwm 18, 18, V_25 + vsraw 4, 4, V_26 + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm \_v0, 4, V_MKQ, 8 + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 +.endm + +#----------------------------------- +# MREDUCE_4X(len, start, _vz0, _vz1, _vz2, _vz3) +# +.macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 + vmladduhm 15, 25, \_vz0, 3 + vmladduhm 20, 26, \_vz1, 3 + vmladduhm 27, 30, \_vz2, 3 + vmladduhm 28, 31, \_vz3, 3 + + vmhraddshs 14, 25, \_vz0, 3 + vmhraddshs 19, 26, \_vz1, 3 + vmhraddshs 24, 30, \_vz2, 3 + vmhraddshs 29, 31, \_vz3, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 27, V_QINV, 3 + vmladduhm 30, 28, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 + + vsrah \_vo0, 15, 4 # >> 1 + vsrah \_vo1, 20, 4 # >> 1 + vsrah \_vo2, 25, 4 # >> 1 + vsrah \_vo3, 30, 4 # >> 1 +.endm + +.macro Set_mont_consts + xxlor 32+5, 0, 0 # V_NMKQ + xxlor 32+2, 2, 2 # V_QINV + xxlor 32+3, 3, 3 # 0 + xxlor 32+4, 4, 4 # 1 +.endm + +.macro Load_next_4zetas + lxv 32+V_Z0, 0(14) + lxv 32+V_Z1, 16(14) + lxv 32+V_Z2, 32(14) + lxv 32+V_Z3, 48(14) + addi 14, 14, 64 +.endm + +.macro Write_B4C _vs0 _vs1 _vs2 _vs3 + stxvx \_vs0, 3, 9 + stxvx \_vs1, 3, 16 + stxvx \_vs2, 
3, 18 + stxvx \_vs3, 3, 20 +.endm + +.macro Write_M4C _vs0 _vs1 _vs2 _vs3 + stxvx \_vs0, 3, 10 + stxvx \_vs1, 3, 17 + stxvx \_vs2, 3, 19 + stxvx \_vs3, 3, 21 +.endm + +.macro Reload_4coeffs + lxv 32+25, 0(3) + lxv 32+26, 16(3) + lxv 32+30, 32(3) + lxv 32+31, 48(3) + addi 3, 3, 64 +.endm + +.macro MWrite_8X _vs0 _vs1 _vs2 _vs3 _vs4 _vs5 _vs6 _vs7 + stxv \_vs0, -128(3) + stxv \_vs1, -112(3) + stxv \_vs2, -96(3) + stxv \_vs3, -80(3) + stxv \_vs4, -64(3) + stxv \_vs5, -48(3) + stxv \_vs6, -32(3) + stxv \_vs7, -16(3) +.endm + +.macro Write_Len2_4C _vs0 _vs1 _vs2 _vs3 + xxmrglw 32+12, \_vs0, 10 + xxmrghw 32+11, \_vs0, 10 + xxpermdi 10, 32+12, 32+11, 3 + xxmrglw 32+16, \_vs1, 11 + xxmrghw 32+15, \_vs1, 11 + xxpermdi 11, 32+16, 32+15, 3 + xxmrglw 32+12, \_vs2, 12 + xxmrghw 32+11, \_vs2, 12 + xxpermdi 12, 32+12, 32+11, 3 + xxmrglw 32+16, \_vs3, 13 + xxmrghw 32+15, \_vs3, 13 + xxpermdi 13, 32+16, 32+15, 3 + stxvd2x 10, 3, 9 + stxvd2x 11, 3, 16 + stxvd2x 12, 3, 18 + stxvd2x 13, 3, 20 +.endm + +.macro Write_Len4_4C _vs0 _vs1 _vs2 _vs3 + xxpermdi 10, 10, \_vs0, 3 + xxpermdi 11, 11, \_vs1, 3 + xxpermdi 12, 12, \_vs2, 3 + xxpermdi 13, 13, \_vs3, 3 + stxvd2x 10, 3, 9 + stxvd2x 11, 3, 16 + stxvd2x 12, 3, 18 + stxvd2x 13, 3, 20 +.endm + +# intt +# t = r[j]; +# r[j] = barrett_reduce(t + r[j + len]); +# r[j + len] = r[j + len] - t; +# r[j + len] = fqmul(zeta, r[j + len]); + +# +# mlk_intt_ppc(r) +# +.global MLK_ASM_NAMESPACE(intt_ppc) +.align 4 +MLK_ASM_FN_SYMBOL(intt_ppc) + + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + stxv 32+25, 208(1) + stxv 32+26, 224(1) + stxv 32+27, 240(1) + stxv 32+28, 256(1) + stxv 32+29, 272(1) + stxv 32+30, 288(1) + stxv 32+31, 304(1) + + # init vectors and constants + # Setup for Montgomery reduce + addis 8,2,.nmkq@toc@ha + addi 
8,8,.nmkq@toc@l + lxv 0, 0(8) + + lxv 32+V_QINV, 16(8) # QINV + xxlxor 32+3, 32+3, 32+3 + vspltish 4, 1 + xxlor 2, 32+2, 32+2 + xxlor 3, 32+3, 32+3 + xxlor 4, 32+4, 32+4 + + # Setup for Barrett reduce + addis 8,2,.mkq@toc@ha + addi 8,8,.mkq@toc@l + addis 9,2,.C20159@toc@ha + addi 9,9,.C20159@toc@l + addis 10,2,.C25@toc@ha + addi 10,10,.C25@toc@l + + lxv 6, 0(8) # V_MKQ + lxv 32+0, 0(9) # V20159 + lxv 7, 0(10) # V_25 + + #xxspltiw 8, 26 # NOTE(review): xxspltiw appears to be Power ISA 3.1 (POWER10), not POWER9 - confirm + vspltisw 8, 13 # splat 13 into every word + vadduwm 8, 8, 8 # 13 + 13 = 26, the Barrett shift count + xxlor 8, 32+8, 32+8 # parked in vs8; BREDUCE_4X copies it into V_26 + + # zetas array + #addis 14,2,.izeta63@toc@ha + #addi 14,14,.izeta63@toc@l + +.align 4 +__Len2: + # + # 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 + # Update zetas vectors, each vector has 2 zetas + addis 14,2,.izeta127@toc@ha + addi 14,14,.izeta127@toc@l + li 4, 4 # len * 2 + li 15, 4 # loop count; each iteration handles two 64-byte groups + mtctr 15 + li 5, 0 # start offset +__Loop2: + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_Len2_4C 32+13, 32+18, 32+23, 32+28 + + addi 5, 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_Len2_4C 32+13, 32+18, 32+23, 32+28 + addi 5, 5, 64 + bdnz __Loop2 + +.align 4 +__Len4: + # + # 2.
len = 4, start = 0, 8, 16, 24,...232, 240, 248 + addis 14,2,.izeta63@toc@ha + addi 14,14,.izeta63@toc@l + li 5, 0 + li 4, 8 + li 15, 4 # loops + mtctr 15 +__Loop4: + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_Len4_4C 32+13, 32+18, 32+23, 32+28 + addi 5, 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_Len4_4C 32+13, 32+18, 32+23, 32+28 + addi 5, 5, 64 + bdnz __Loop4 + +.align 4 +__Len8: + # 3. len = 8, start = 0, 16, 32, 48,...208, 224, 240 + #addi 14, 14, 512 + li 4, 16 + li 5, 0 + + Load_4Coeffs 5, 32, 32 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 128 + + Load_4Coeffs 5, 32, 32 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 256 + + Load_4Coeffs 5, 32, 32 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 384 + + Load_4Coeffs 5, 32, 32 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 +__Len16: + # + # 4. 
len = 16, start = 0, 32, 64,,...160, 192, 224 + #addi 14, 14, 768 + li 5, 0 + li 4, 32 + + Load_4Coeffs 5, 64, 64 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 16 + Load_4Coeffs 5, 64, 64 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + addi 14, 14, -64 + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + li 5, 256 + Load_4Coeffs 5, 64, 64 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + li 5, 272 + Load_4Coeffs 5, 64, 64 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + addi 14, 14, -64 + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 +__Len32: + # + # 5. 
len = 32, start = 0, 64, 128, 192 + #addi 14, 14, 896 + li 5, 0 + li 4, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 128 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 256 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 384 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 +__Len64: + # + # 6. 
len = 64, start = 0, 128 + #addi 14, 14, 960 + li 5, 0 + li 4, 128 + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lxv 32+10, -16(14) + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 256 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 320 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lxv 32+10, -16(14) + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 +__Len128: + # 7. 
len = 128, start = 0 + # + #addi 14, 14, 992 + li 5, 0 # start + li 4, 256 # len * 2 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + xxlor 9, 32+10, 32+10 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 128 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 192 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 + # + # Montgomery reduce loops with constant 1441 + # + addis 10,2,.C1441@toc@ha + addi 10,10,.C1441@toc@l + lvx V1441, 0, 10 + + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 
32+18, 32+23, 32+28 + +__intt_out: + lxv 32+20, 128(1) # restore v20-v31 saved by the prologue + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + lxv 32+25, 208(1) + lxv 32+26, 224(1) + lxv 32+27, 240(1) + lxv 32+28, 256(1) + lxv 32+29, 272(1) + lxv 32+30, 288(1) + lxv 32+31, 304(1) + ld 14, 56(1) # restore r14-r21 saved by the prologue + ld 15, 64(1) + ld 16, 72(1) + ld 16, 72(1) # NOTE(review): duplicate of the previous load; redundant but harmless + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 # pop the 352-byte frame created by stdu + blr + +.data +.align 4 +# -MLKEM_Q +.nmkq: +.short -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329 +# QINV +.short -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327 + +# MLKEM_Q +.mkq: +.short 3329, 3329, 3329, 3329, 3329, 3329, 3329, 3329 + +.C20159: +.short 20159, 20159, 20159, 20159, 20159, 20159, 20159, 20159 + +# 0x2000000 = 1 << 25, added in BREDUCE_4X before the arithmetic shift right by 26 +.C25: +.long 33554432, 33554432, 33554432, 33554432 + +.C1441: +.short 1441, 1441, 1441, 1441, 1441, 1441, 1441, 1441 + +.align 4 +.izeta127: +.short 1628, 1628, 1628, 1628, 1522, 1522, 1522, 1522 +.short -1460, -1460, -1460, -1460, 958, 958, 958, 958 +.short 991, 991, 991, 991, 996, 996, 996, 996 +.short -308, -308, -308, -308, -108, -108, -108, -108 +.short 478, 478, 478, 478, -870, -870, -870, -870 +.short -854, -854, -854, -854, -1510, -1510, -1510, -1510 +.short 794, 794, 794, 794, -1278, -1278, -1278, -1278 +.short -1530, -1530, -1530, -1530, -1185, -1185, -1185, -1185 +.short -1659, -1659, -1659, -1659, -1187, -1187, -1187, -1187 +.short 220, 220, 220, 220, -874, -874, -874, -874 +.short -1335, -1335, -1335, -1335, 1218, 1218, 1218, 1218 +.short -136, -136, -136, -136, -1215, -1215, -1215, -1215 +.short 384, 384, 384, 384, -1465, -1465, -1465, -1465 +.short -1285, -1285, -1285, -1285, 1322, 1322, 1322, 1322 +.short 610, 610, 610, 610, 603, 603, 603, 603 +.short 1097, 1097, 1097, 1097, 817, 817, 817, 817 +.short -75, -75, -75, -75, -156, -156, -156, -156 +.short 329, 329, 329, 329, 418, 418, 418, 418 +.short 349, 349, 349, 349, -872, -872, -872, -872 +.short 644, 644, 644, 644, -1590,
-1590, -1590, -1590 +.short 1119, 1119, 1119, 1119, -602, -602, -602, -602 +.short 1483, 1483, 1483, 1483, -777, -777, -777, -777 +.short -147, -147, -147, -147, 1159, 1159, 1159, 1159 +.short 778, 778, 778, 778, -246, -246, -246, -246 +.short 1653, 1653, 1653, 1653, 1574, 1574, 1574, 1574 +.short -460, -460, -460, -460, -291, -291, -291, -291 +.short -235, -235, -235, -235, 177, 177, 177, 177 +.short 587, 587, 587, 587, 422, 422, 422, 422 +.short 105, 105, 105, 105, 1550, 1550, 1550, 1550 +.short 871, 871, 871, 871, -1251, -1251, -1251, -1251 +.short 843, 843, 843, 843, 555, 555, 555, 555 +.short 430, 430, 430, 430, -1103, -1103, -1103, -1103 +.izeta63: +.short -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275 +.short 677, 677, 677, 677, 677, 677, 677, 677 +.short -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065 +.short 448, 448, 448, 448, 448, 448, 448, 448 +.short -725, -725, -725, -725, -725, -725, -725, -725 +.short -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508 +.short 961, 961, 961, 961, 961, 961, 961, 961 +.short -398, -398, -398, -398, -398, -398, -398, -398 +.short -951, -951, -951, -951, -951, -951, -951, -951 +.short -247, -247, -247, -247, -247, -247, -247, -247 +.short -1421, -1421, -1421, -1421, -1421, -1421, -1421, -1421 +.short 107, 107, 107, 107, 107, 107, 107, 107 +.short 830, 830, 830, 830, 830, 830, 830, 830 +.short -271, -271, -271, -271, -271, -271, -271, -271 +.short -90, -90, -90, -90, -90, -90, -90, -90 +.short -853, -853, -853, -853, -853, -853, -853, -853 +.short 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469 +.short 126, 126, 126, 126, 126, 126, 126, 126 +.short -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162 +.short -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618 +.short -666, -666, -666, -666, -666, -666, -666, -666 +.short -320, -320, -320, -320, -320, -320, -320, -320 +.short -8, -8, -8, -8, -8, -8, -8, -8 +.short 516, 516, 516, 516, 516, 516, 516, 516 +.short -1544, -1544, -1544, -1544, -1544, 
-1544, -1544, -1544 +.short -282, -282, -282, -282, -282, -282, -282, -282 +.short 1491, 1491, 1491, 1491, 1491, 1491, 1491, 1491 +.short -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293 +.short 1015, 1015, 1015, 1015, 1015, 1015, 1015, 1015 +.short -552, -552, -552, -552, -552, -552, -552, -552 +.short 652, 652, 652, 652, 652, 652, 652, 652 +.short 1223, 1223, 1223, 1223, 1223, 1223, 1223, 1223 +.short -1571, -1571, -1571, -1571, -1571, -1571, -1571, -1571 +.short -205, -205, -205, -205, -205, -205, -205, -205 +.short 411, 411, 411, 411, 411, 411, 411, 411 +.short -1542, -1542, -1542, -1542, -1542, -1542, -1542, -1542 +.short 608, 608, 608, 608, 608, 608, 608, 608 +.short 732, 732, 732, 732, 732, 732, 732, 732 +.short 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017 +.short -681, -681, -681, -681, -681, -681, -681, -681 +.short -130, -130, -130, -130, -130, -130, -130, -130 +.short -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602 +.short 1458, 1458, 1458, 1458, 1458, 1458, 1458, 1458 +.short -829, -829, -829, -829, -829, -829, -829, -829 +.short 383, 383, 383, 383, 383, 383, 383, 383 +.short 264, 264, 264, 264, 264, 264, 264, 264 +.short -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325 +.short 573, 573, 573, 573, 573, 573, 573, 573 +.short 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468 +.short -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474 +.short -1202, -1202, -1202, -1202, -1202, -1202, -1202, -1202 +.short 962, 962, 962, 962, 962, 962, 962, 962 +.short 182, 182, 182, 182, 182, 182, 182, 182 +.short 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577 +.short 622, 622, 622, 622, 622, 622, 622, 622 +.short -171, -171, -171, -171, -171, -171, -171, -171 +.short 202, 202, 202, 202, 202, 202, 202, 202 +.short 287, 287, 287, 287, 287, 287, 287, 287 +.short 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422 +.short 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493 +.short -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517 +.short -359, -359, -359, -359, 
-359, -359, -359, -359 +.short -758, -758, -758, -758, -758, -758, -758, -758 diff --git a/dev/ppc64le/src/ntt_ppc.S b/dev/ppc64le/src/ntt_ppc.S new file mode 100644 index 000000000..172fef9cc --- /dev/null +++ b/dev/ppc64le/src/ntt_ppc.S @@ -0,0 +1,498 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. +# +#=================================================================================== +# Written by Danny Tsen +# + +#include "../../../common.h" + +#define V_QINV 2 +#define V_NMKQ 5 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 +#define V_ZETA 10 + +.machine "any" +.text + +# +# montgomery_reduce +# t = a * QINV +# t = (a - (int32_t)t*_MLKEM_Q) >> 16 +# +#----------------------------------- +# MREDUCE_4X(start, _vz0, _vz1, _vz2, _vz3) +# +.macro MREDUCE_4X start next step _vz0 _vz1 _vz2 _vz3 + mr 9, \start + add 10, 4, 9 # J + len*2 + addi 16, 9, \next + addi 17, 10, \step + addi 18, 16, \next + addi 19, 17, \step + addi 20, 18, \next + addi 21, 19, \step + lxvd2x 32+13, 3, 10 # r[j+len] + lxvd2x 32+18, 3, 17 # r[j+len] + lxvd2x 32+23, 3, 19 # r[j+len] + lxvd2x 32+28, 3, 21 # r[j+len] + xxpermdi 32+13, 32+13, 32+13, 2 + xxpermdi 32+18, 32+18, 32+18, 2 + xxpermdi 32+23, 32+23, 32+23, 2 + xxpermdi 32+28, 32+28, 32+28, 2 + + # fqmul = zeta * coefficient + vmladduhm 15, 13, \_vz0, 3 + vmladduhm 20, 18, \_vz1, 3 + vmladduhm 25, 23, \_vz2, 3 + vmladduhm 30, 28, \_vz3, 3 + + vmhraddshs 14, 13, \_vz0, 3 + vmhraddshs 19, 18, \_vz1, 3 + vmhraddshs 24, 23, \_vz2, 3 + vmhraddshs 29, 28, \_vz3, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, V_QINV, 3 + vmladduhm 30, 30, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 + + vsrah 13, 15, 4 # >> 1 + vsrah 18, 20, 4 # >> 1 + vsrah 23, 25, 4 # >> 1 + vsrah 28, 30, 4 # >> 1 + + lxvd2x 
32+12, 3, 9 # r[j] + lxvd2x 32+17, 3, 16 # r[j] + lxvd2x 32+22, 3, 18 # r[j] + lxvd2x 32+27, 3, 20 # r[j] + xxpermdi 32+12, 32+12, 32+12, 2 + xxpermdi 32+17, 32+17, 32+17, 2 + xxpermdi 32+22, 32+22, 32+22, 2 + xxpermdi 32+27, 32+27, 32+27, 2 + + vsubuhm 16, 12, 13 # r - t + vadduhm 15, 13, 12 # r + t + vsubuhm 21, 17, 18 # r - t + vadduhm 20, 18, 17 # r + t + vsubuhm 26, 22, 23 # r - t + vadduhm 25, 23, 22 # r + t + vsubuhm 31, 27, 28 # r - t + vadduhm 30, 28, 27 # r + t +.endm + +.macro Write_One + stxvx 32+15, 3, 9 + stxvx 32+16, 3, 10 + stxvx 32+20, 3, 16 + stxvx 32+21, 3, 17 + stxvx 32+25, 3, 18 + stxvx 32+26, 3, 19 + stxvx 32+30, 3, 20 + stxvx 32+31, 3, 21 +.endm + +.macro Write_Two + xxpermdi 32+17, 32+16, 32+15, 3 + xxpermdi 32+22, 32+21, 32+20, 3 + xxpermdi 32+27, 32+26, 32+25, 3 + xxpermdi 32+29, 32+31, 32+30, 3 + + stxvx 32+17, 3, 9 + stxvx 32+22, 3, 16 + stxvx 32+27, 3, 18 + stxvx 32+29, 3, 20 +.endm + +.macro Write_Three + xxmrglw 32+14, 32+16, 32+15 + xxmrghw 32+13, 32+16, 32+15 + xxpermdi 32+17, 32+13, 32+14, 3 + xxmrglw 32+19, 32+21, 32+20 + xxmrghw 32+18, 32+21, 32+20 + xxpermdi 32+22, 32+18, 32+19, 3 + xxmrglw 32+14, 32+26, 32+25 + xxmrghw 32+13, 32+26, 32+25 + xxpermdi 32+27, 32+13, 32+14, 3 + xxmrglw 32+24, 32+31, 32+30 + xxmrghw 32+23, 32+31, 32+30 + xxpermdi 32+29, 32+23, 32+24, 3 + stxvx 32+17, 3, 9 + stxvx 32+22, 3, 16 + stxvx 32+27, 3, 18 + stxvx 32+29, 3, 20 +.endm + +.macro Load_next_4zetas + lxv 32+V_Z0, 0(14) + lxv 32+V_Z1, 16(14) + lxv 32+V_Z2, 32(14) + lxv 32+V_Z3, 48(14) + addi 14, 14, 64 +.endm + +# +# mlk_ntt_ppc(int16_t *r) +# +.global MLK_ASM_NAMESPACE(ntt_ppc) +.align 4 +MLK_ASM_FN_SYMBOL(ntt_ppc) + + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + stxv 32+25, 208(1) + stxv 32+26, 224(1) + stxv 32+27, 240(1) + stxv 
32+28, 256(1) + stxv 32+29, 272(1) + stxv 32+30, 288(1) + stxv 32+31, 304(1) + + # get MLKEM_Q + addis 8,2,.nmkq@toc@ha + addi 8,8,.nmkq@toc@l + lvx V_NMKQ,0,8 + + # zetas array + addis 14,2,.K1@toc@ha + addi 14,14,.K1@toc@l + + vxor 3, 3, 3 + vspltish 4, 1 + lxv 32+V_QINV, 16(8) + +.align 4 +__Len128: + # + # Compute coefficients of the NTT based on the following loop. + # for (len = 128; len ≥ 2; len = len/2) + # + # 1. len = 128, start = 0 + # + li 5, 0 # start + li 4, 256 # len * 2 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 64 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 128 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 192 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + +.align 4 +__Len64: + # + # 2. len = 64, start = 0, 128 + # k += 2 + li 5, 0 + li 4, 128 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 64 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 256 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 320 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + +.align 4 +__Len32: + # + # 3. len = 32, start = 0, 64, 128, 192 + # k += 4 + li 5, 0 + li 4, 64 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + #li 5, 64 + li 5, 128 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + #li 5, 128 + li 5, 256 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + #li 5, 192 + li 5, 384 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + +.align 4 +__Len16: + # + # 4. 
len = 16, start = 0, 32, 64,,...160, 192, 224 + # k += 8 + li 5, 0 + li 4, 32 + Load_next_4zetas + MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 16 + MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + + Load_next_4zetas + li 5, 256 + MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 272 + MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + +.align 4 +__Len8: + # + # 5. len = 8, start = 0, 16, 32, 48,...208, 224, 240 + # k += 16 + li 5, 0 + li 4, 16 + Load_next_4zetas + MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 128 + + Load_next_4zetas + MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 256 + + Load_next_4zetas + MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 384 + + Load_next_4zetas + MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + + # + # 6. len = 4, start = 0, 8, 16, 24,...232, 240, 248 + # k += 32 + li 15, 4 # loops + mtctr 15 + li 5, 0 + li 4, 8 +.align 4 +__Len4: + Load_next_4zetas + MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 + Write_Two + addi 5, 5, 64 + + Load_next_4zetas + MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 + Write_Two + addi 5, 5, 64 + + bdnz __Len4 + + # + # 7. 
len = 2, start = 0, 4, 8, 12,...244, 248, 252 + # k += 64 + # Update zetas vectors, each vector has 2 zetas + addis 14,2,.K64@toc@ha + addi 14,14,.K64@toc@l + + li 15, 4 + mtctr 15 + li 5, 0 + li 4, 4 +.align 4 +__Len2: + Load_next_4zetas + MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 + Write_Three + addi 5, 5, 64 + + Load_next_4zetas + MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 + Write_Three + addi 5, 5, 64 + + bdnz __Len2 + +__ntt_out: + lxv 32+20, 128(1) + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + lxv 32+25, 208(1) + lxv 32+26, 224(1) + lxv 32+27, 240(1) + lxv 32+28, 256(1) + lxv 32+29, 272(1) + lxv 32+30, 288(1) + lxv 32+31, 304(1) + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 + blr + +.data +.align 4 +# -MLKEM_Q +.nmkq: +.short -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329 +# QINV +.short -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327 + +# zetas +.K1: +.short -758, -758, -758, -758, -758, -758, -758, -758 +.short -359, -359, -359, -359, -359, -359, -359, -359 +.short -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517 +.short 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493 +.short 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422 +.short 287, 287, 287, 287, 287, 287, 287, 287 +.short 202, 202, 202, 202, 202, 202, 202, 202 +.short -171, -171, -171, -171, -171, -171, -171, -171 +.short 622, 622, 622, 622, 622, 622, 622, 622 +.short 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577 +.short 182, 182, 182, 182, 182, 182, 182, 182 +.short 962, 962, 962, 962, 962, 962, 962, 962 +.short -1202, -1202, -1202, -1202, -1202, -1202, -1202, -1202 +.short -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474 +.short 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468 +.short 573, 573, 573, 573, 573, 573, 573, 573 +.short -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325 +.short 264, 264, 264, 264, 264, 
264, 264, 264 +.short 383, 383, 383, 383, 383, 383, 383, 383 +.short -829, -829, -829, -829, -829, -829, -829, -829 +.short 1458, 1458, 1458, 1458, 1458, 1458, 1458, 1458 +.short -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602 +.short -130, -130, -130, -130, -130, -130, -130, -130 +.short -681, -681, -681, -681, -681, -681, -681, -681 +.short 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017 +.short 732, 732, 732, 732, 732, 732, 732, 732 +.short 608, 608, 608, 608, 608, 608, 608, 608 +.short -1542, -1542, -1542, -1542, -1542, -1542, -1542, -1542 +.short 411, 411, 411, 411, 411, 411, 411, 411 +.short -205, -205, -205, -205, -205, -205, -205, -205 +.short -1571, -1571, -1571, -1571, -1571, -1571, -1571, -1571 +.short 1223, 1223, 1223, 1223, 1223, 1223, 1223, 1223 +.short 652, 652, 652, 652, 652, 652, 652, 652 +.short -552, -552, -552, -552, -552, -552, -552, -552 +.short 1015, 1015, 1015, 1015, 1015, 1015, 1015, 1015 +.short -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293 +.short 1491, 1491, 1491, 1491, 1491, 1491, 1491, 1491 +.short -282, -282, -282, -282, -282, -282, -282, -282 +.short -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544 +.short 516, 516, 516, 516, 516, 516, 516, 516 +.short -8, -8, -8, -8, -8, -8, -8, -8 +.short -320, -320, -320, -320, -320, -320, -320, -320 +.short -666, -666, -666, -666, -666, -666, -666, -666 +.short -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618 +.short -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162 +.short 126, 126, 126, 126, 126, 126, 126, 126 +.short 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469 +.short -853, -853, -853, -853, -853, -853, -853, -853 +.short -90, -90, -90, -90, -90, -90, -90, -90 +.short -271, -271, -271, -271, -271, -271, -271, -271 +.short 830, 830, 830, 830, 830, 830, 830, 830 +.short 107, 107, 107, 107, 107, 107, 107, 107 +.short -1421, -1421, -1421, -1421, -1421, -1421, -1421, -1421 +.short -247, -247, -247, -247, -247, -247, -247, -247 +.short -951, -951, -951, -951, -951, 
-951, -951, -951 +.short -398, -398, -398, -398, -398, -398, -398, -398 +.short 961, 961, 961, 961, 961, 961, 961, 961 +.short -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508 +.short -725, -725, -725, -725, -725, -725, -725, -725 +.short 448, 448, 448, 448, 448, 448, 448, 448 +.short -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065 +.short 677, 677, 677, 677, 677, 677, 677, 677 +.short -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275 +.K64: +.short -1103, -1103, -1103, -1103, 430, 430, 430, 430 +.short 555, 555, 555, 555, 843, 843, 843, 843 +.short -1251, -1251, -1251, -1251, 871, 871, 871, 871 +.short 1550, 1550, 1550, 1550, 105, 105, 105, 105 +.short 422, 422, 422, 422, 587, 587, 587, 587 +.short 177, 177, 177, 177, -235, -235, -235, -235 +.short -291, -291, -291, -291, -460, -460, -460, -460 +.short 1574, 1574, 1574, 1574, 1653, 1653, 1653, 1653 +.short -246, -246, -246, -246, 778, 778, 778, 778 +.short 1159, 1159, 1159, 1159, -147, -147, -147, -147 +.short -777, -777, -777, -777, 1483, 1483, 1483, 1483 +.short -602, -602, -602, -602, 1119, 1119, 1119, 1119 +.short -1590, -1590, -1590, -1590, 644, 644, 644, 644 +.short -872, -872, -872, -872, 349, 349, 349, 349 +.short 418, 418, 418, 418, 329, 329, 329, 329 +.short -156, -156, -156, -156, -75, -75, -75, -75 +.short 817, 817, 817, 817, 1097, 1097, 1097, 1097 +.short 603, 603, 603, 603, 610, 610, 610, 610 +.short 1322, 1322, 1322, 1322, -1285, -1285, -1285, -1285 +.short -1465, -1465, -1465, -1465, 384, 384, 384, 384 +.short -1215, -1215, -1215, -1215, -136, -136, -136, -136 +.short 1218, 1218, 1218, 1218, -1335, -1335, -1335, -1335 +.short -874, -874, -874, -874, 220, 220, 220, 220 +.short -1187, -1187, -1187, -1187, -1659, -1659, -1659, -1659 +.short -1185, -1185, -1185, -1185, -1530, -1530, -1530, -1530 +.short -1278, -1278, -1278, -1278, 794, 794, 794, 794 +.short -1510, -1510, -1510, -1510, -854, -854, -854, -854 +.short -870, -870, -870, -870, 478, 478, 478, 478 +.short -108, -108, -108, 
-108, -308, -308, -308, -308 +.short 996, 996, 996, 996, 991, 991, 991, 991 +.short 958, 958, 958, 958, -1460, -1460, -1460, -1460 +.short 1522, 1522, 1522, 1522, 1628, 1628, 1628, 1628 diff --git a/dev/ppc64le/src/poly_tomont.S b/dev/ppc64le/src/poly_tomont.S new file mode 100644 index 000000000..c07f25c5a --- /dev/null +++ b/dev/ppc64le/src/poly_tomont.S @@ -0,0 +1,163 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. +# +#=================================================================================== +# Written by Danny Tsen +# + +# Poly_tomont: Inplace conversion of all coefficients of a polynomial +# from normal domain to Montgomery domain +# +# Arguments:*r: pointer to input/output polynomial +# + +#include "../../../common.h" + +#define V1353 0 +#define V_QINV 2 +#define V_NMKQ 5 + +.machine "any" +.text + +# +# montgomery_reduce +# t = a * QINV +# t = (a - (int32_t)t*_MLKEM_Q) >> 16 +# +#----------------------------------- +# MREDUCE_4X(_v0, _v1, _v2, _v3) +# +.macro MREDUCE_4X _v0 _v1 _v2 _v3 + lxvd2x 32+13, 0, 3 + addi 3, 3, 16 + lxvd2x 32+18, 0, 3 + addi 3, 3, 16 + lxvd2x 32+23, 0, 3 + addi 3, 3, 16 + lxvd2x 32+7, 0, 3 + addi 3, 3, 16 + + vmladduhm 15, 13, V1353, 3 + vmladduhm 20, 18, V1353, 3 + vmladduhm 25, 23, V1353, 3 + vmladduhm 9, 7, V1353, 3 + + vmhraddshs 14, 13, V1353, 3 + vmhraddshs 19, 18, V1353, 3 + vmhraddshs 24, 23, V1353, 3 + vmhraddshs 8, 7, V1353, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, V_QINV, 3 + vmladduhm 9, 9, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 9, 9, V_NMKQ, 8 + + vsrah \_v0, 15, 4 # >> 1 + vsrah \_v1, 20, 4 # >> 1 + vsrah \_v2, 25, 4 # >> 1 + vsrah \_v3, 9, 4 # >> 1 +.endm + +.macro Write_8X + stxvd2x 32+27, 4, 3 + stxvd2x 32+28, 5, 3 + stxvd2x 32+29, 6, 3 + stxvd2x 32+30, 7, 3 + stxvd2x 32+13, 8, 3 + 
stxvd2x 32+18, 9, 3 + stxvd2x 32+23, 10, 3 + stxvd2x 32+7, 11, 3 +.endm + +.align 4 +.globl MLK_ASM_NAMESPACE(poly_tomont_ppc) +MLK_ASM_FN_SYMBOL(poly_tomont_ppc) + stdu 1, -320(1) + mflr 0 + + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + stxv 32+25, 208(1) + stxv 32+26, 224(1) + stxv 32+27, 240(1) + stxv 32+28, 256(1) + stxv 32+29, 272(1) + stxv 32+30, 288(1) + + addis 9,2,.nmkq@toc@ha + addi 9,9,.nmkq@toc@l + addis 10,2,.C1353@toc@ha + addi 10,10,.C1353@toc@l + + lxv 32+V_NMKQ,0(9) + lxv 32+V_QINV,16(9) + lxv 32+V1353,0(10) + + vxor 3, 3, 3 + vspltish 4, 1 + + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + lxv 32+20, 128(1) + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + lxv 32+25, 208(1) + lxv 32+26, 224(1) + lxv 32+27, 240(1) + lxv 32+28, 256(1) + lxv 32+29, 272(1) + lxv 32+30, 288(1) + mtlr 0 + addi 1, 1, 320 + blr + +.data +.align 4 +# -MLKEM_Q +.nmkq: +.short -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329 +# QINV +.short -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327 + +.C1353: +.short 1353, 1353, 1353, 1353, 1353, 1353, 1353, 1353 + diff --git a/dev/ppc64le/src/reduce.S b/dev/ppc64le/src/reduce.S new file mode 100644 index 000000000..ee8e1fdca --- /dev/null +++ b/dev/ppc64le/src/reduce.S @@ -0,0 +1,225 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. 
+# +#=================================================================================== +# Written by Danny Tsen +# + +# +# poly_reduce: Applies Barrett reduction to all coefficients of a polynomial +# for details of the Barrett reduction +# +# Arguments: *r: pointer to input/output polynomial +# + +#include "../../../common.h" + +# Barrett reduce constants +#define V20159 0 +#define V_25 1 +#define V_26 2 +#define V_MKQ 3 + +.machine "any" +.text + +.macro BREDUCE_4X _v0 _v1 _v2 _v3 + lxvd2x 32+8, 0, 3 + lxvd2x 32+12, 14, 3 + lxvd2x 32+16, 15, 3 + lxvd2x 32+20, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, V20159 + vmulesh 5, 8, V20159 + vmulosh 11, 12, V20159 + vmulesh 10, 12, V20159 + vmulosh 15, 16, V20159 + vmulesh 14, 16, V20159 + vmulosh 19, 20, V20159 + vmulesh 18, 20, V20159 + xxmrglw 32+4, 32+5, 32+6 + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V_25 + vadduwm 5, 5, V_25 + vadduwm 9, 9, V_25 + vadduwm 10, 10, V_25 + vadduwm 13, 13, V_25 + vadduwm 14, 14, V_25 + vadduwm 17, 17, V_25 + vadduwm 18, 18, V_25 + vsraw 4, 4, V_26 + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm \_v0, 4, V_MKQ, 8 + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 +.endm + +.macro Write_8X + stxvd2x 32+21, 4, 3 + stxvd2x 32+22, 5, 3 + stxvd2x 32+23, 6, 3 + stxvd2x 32+24, 7, 3 + stxvd2x 32+4, 8, 3 + stxvd2x 32+9, 9, 3 + stxvd2x 32+13, 10, 3 + stxvd2x 32+17, 11, 3 +.endm + +# +# Conditional addition to get unsigned canonical representative +# +.macro To_unsigned_16 + lxv 32+12, 0(3) + lxv 32+13, 16(3) + lxv 32+14, 
32(3) + lxv 32+15, 48(3) + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 32+1, 32+7,32+12, 32+1 + xxsel 32+0, 32+8,32+13, 32+0 + xxsel 32+3, 32+5,32+14, 32+3 + xxsel 32+2, 32+6,32+15, 32+2 + stxv 32+3, -32(3) + stxv 32+2, -16(3) + stxv 32+1, -64(3) + stxv 32+0, -48(3) +.endm + +.align 4 +.globl MLK_ASM_NAMESPACE(reduce_ppc) +MLK_ASM_FN_SYMBOL(reduce_ppc) + stdu 1, -224(1) + mflr 0 + std 14, 96(1) + std 15, 104(1) + std 16, 112(1) + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + + addis 8,2,.mkq@toc@ha + addi 8,8,.mkq@toc@l + addis 9,2,.C20159@toc@ha + addi 9,9,.C20159@toc@l + addis 10,2,.C25@toc@ha + addi 10,10,.C25@toc@l + + vxor 7, 7, 7 + + lxv 32+V_MKQ, 0(8) + lxv 32+V20159, 0(9) + lxv 32+V_25, 0(10) + + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + li 14, 16 + li 15, 32 + li 16, 48 + + vspltisw V_26, 13 + vadduwm V_26, V_26, V_26 + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + # + # To unsigned canonical + # +.align 4 + addi 3, 3, -512 + xxspltib 32+9 ,0 + vspltish 10, 15 + vmr 11, V_MKQ + + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + + ld 14, 96(1) + ld 15, 104(1) + ld 16, 112(1) + lxv 32+20, 128(1) + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + mtlr 0 + addi 1, 1, 224 + blr + +.align 4 +.data +# MLKEM_Q +.mkq: +.short 3329, 3329, 3329, 3329, 3329, 3329, 3329, 3329 + +.C20159: +.short 20159, 20159, 20159, 
20159, 20159, 20159, 20159, 20159 + +# 0x2000000 +.C25: +.long 33554432, 33554432, 33554432, 33554432 diff --git a/integration/liboqs/ML-KEM-1024_META.yml b/integration/liboqs/ML-KEM-1024_META.yml index 7d8e50d4c..766c936e2 100644 --- a/integration/liboqs/ML-KEM-1024_META.yml +++ b/integration/liboqs/ML-KEM-1024_META.yml @@ -9,83 +9,74 @@ length-ciphertext: 1568 length-secret-key: 3168 length-shared-secret: 32 length-keypair-seed: 64 -length-encaps-seed: 32 nistkat-sha256: f580d851e5fb27e6876e5e203fa18be4cdbfd49e05d48fec3d3992c8f43a13e6 testvectors-sha256: ff1a854b9b6761a70c65ccae85246fe0596a949e72eae0866a8a2a2d4ea54b10 principal-submitters: -- Peter Schwabe + - Peter Schwabe auxiliary-submitters: -- Roberto Avanzi -- Joppe Bos -- Léo Ducas -- Eike Kiltz -- Tancrède Lepoint -- Vadim Lyubashevsky -- John M. Schanck -- Gregor Seiler -- Damien Stehlé + - Roberto Avanzi + - Joppe Bos + - Léo Ducas + - Eike Kiltz + - Tancrède Lepoint + - Vadim Lyubashevsky + - John M. Schanck + - Gregor Seiler + - Damien Stehlé implementations: -- name: ref - version: FIPS203 - folder_name: . 
- compile_opts: -DMLK_CONFIG_PARAMETER_SET=1024 -DMLK_CONFIG_FILE="../../integration/liboqs/config_c.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_C_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_C_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_C_enc - signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM1024_C_enc_derand - signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_C_dec - sources: integration/liboqs/config_c.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h - mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h - mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c - mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h - mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h - mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h - mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc -- name: x86_64 - version: FIPS203 - folder_name: . 
- compile_opts: -DMLK_CONFIG_PARAMETER_SET=1024 -DMLK_CONFIG_FILE="../../integration/liboqs/config_x86_64.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_enc - signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_enc_derand - signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_dec - sources: integration/liboqs/config_x86_64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h - mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h - mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c - mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h - mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h - mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h - mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/x86_64 - supported_platforms: - - architecture: x86_64 - operating_systems: - - Linux - - Darwin - required_flags: - - avx2 - - bmi2 - - popcnt -- name: aarch64 - version: FIPS203 - folder_name: . 
- compile_opts: -DMLK_CONFIG_PARAMETER_SET=1024 -DMLK_CONFIG_FILE="../../integration/liboqs/config_aarch64.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_enc - signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_enc_derand - signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_dec - sources: integration/liboqs/config_aarch64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h - mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h - mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c - mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h - mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h - mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h - mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/aarch64 - supported_platforms: - - architecture: arm_8 - operating_systems: - - Linux - - Darwin - required_flags: - - asimd + - name: ref + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=1024 -DMLK_CONFIG_FILE="../../integration/liboqs/config_c.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_C_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_C_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_C_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_C_dec + sources: integration/liboqs/config_c.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc + - name: x86_64 + version: FIPS203 + folder_name: . + compile_opts: -DMLK_CONFIG_PARAMETER_SET=1024 -DMLK_CONFIG_FILE="../../integration/liboqs/config_x86_64.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_dec + sources: integration/liboqs/config_x86_64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/x86_64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc + supported_platforms: + - 
architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - avx2 + - bmi2 + - popcnt + - name: aarch64 + version: FIPS203 + folder_name: . + compile_opts: -DMLK_CONFIG_PARAMETER_SET=1024 -DMLK_CONFIG_FILE="../../integration/liboqs/config_aarch64.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_dec + sources: integration/liboqs/config_aarch64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/aarch64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc + supported_platforms: + - architecture: arm_8 + operating_systems: + - Linux + - Darwin + required_flags: + - asimd + - name: ppc64le + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=1024 -DMLK_CONFIG_FILE="../../integration/liboqs/config_ppc64le.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_dec + sources: integration/liboqs/config_ppc64le.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/ppc64le mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc + supported_platforms: + - architecture: ppc64le + operating_systems: + - Linux diff --git a/integration/liboqs/ML-KEM-512_META.yml b/integration/liboqs/ML-KEM-512_META.yml index aa88537d3..9d2c7633a 100644 --- a/integration/liboqs/ML-KEM-512_META.yml +++ b/integration/liboqs/ML-KEM-512_META.yml @@ -9,83 +9,74 @@ length-ciphertext: 768 length-secret-key: 1632 length-shared-secret: 32 length-keypair-seed: 64 -length-encaps-seed: 32 nistkat-sha256: c70041a761e01cd6426fa60e9fd6a4412c2be817386c8d0f3334898082512782 testvectors-sha256: 6730bb552c22d9d2176ffb5568e48eb30952cf1f065073ec5f9724f6a3c6ea85 principal-submitters: -- Peter Schwabe + - Peter Schwabe auxiliary-submitters: -- Roberto Avanzi -- Joppe Bos -- Léo Ducas -- Eike Kiltz -- Tancrède Lepoint -- Vadim Lyubashevsky -- John M. Schanck -- Gregor Seiler -- Damien Stehlé + - Roberto Avanzi + - Joppe Bos + - Léo Ducas + - Eike Kiltz + - Tancrède Lepoint + - Vadim Lyubashevsky + - John M. Schanck + - Gregor Seiler + - Damien Stehlé implementations: -- name: ref - version: FIPS203 - folder_name: . 
- compile_opts: -DMLK_CONFIG_PARAMETER_SET=512 -DMLK_CONFIG_FILE="../../integration/liboqs/config_c.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_C_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_C_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_C_enc - signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM512_C_enc_derand - signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_C_dec - sources: integration/liboqs/config_c.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h - mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h - mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c - mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h - mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h - mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h - mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc -- name: x86_64 - version: FIPS203 - folder_name: . 
- compile_opts: -DMLK_CONFIG_PARAMETER_SET=512 -DMLK_CONFIG_FILE="../../integration/liboqs/config_x86_64.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_enc - signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_enc_derand - signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_dec - sources: integration/liboqs/config_x86_64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h - mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h - mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c - mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h - mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h - mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h - mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/x86_64 - supported_platforms: - - architecture: x86_64 - operating_systems: - - Linux - - Darwin - required_flags: - - avx2 - - bmi2 - - popcnt -- name: aarch64 - version: FIPS203 - folder_name: . 
- compile_opts: -DMLK_CONFIG_PARAMETER_SET=512 -DMLK_CONFIG_FILE="../../integration/liboqs/config_aarch64.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_enc - signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_enc_derand - signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_dec - sources: integration/liboqs/config_aarch64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h - mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h - mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c - mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h - mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h - mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h - mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/aarch64 - supported_platforms: - - architecture: arm_8 - operating_systems: - - Linux - - Darwin - required_flags: - - asimd + - name: ref + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=512 -DMLK_CONFIG_FILE="../../integration/liboqs/config_c.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_C_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_C_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_C_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_C_dec + sources: integration/liboqs/config_c.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc + - name: x86_64 + version: FIPS203 + folder_name: . + compile_opts: -DMLK_CONFIG_PARAMETER_SET=512 -DMLK_CONFIG_FILE="../../integration/liboqs/config_x86_64.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_dec + sources: integration/liboqs/config_x86_64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/x86_64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc + supported_platforms: + - 
architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - avx2 + - bmi2 + - popcnt + - name: aarch64 + version: FIPS203 + folder_name: . + compile_opts: -DMLK_CONFIG_PARAMETER_SET=512 -DMLK_CONFIG_FILE="../../integration/liboqs/config_aarch64.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_dec + sources: integration/liboqs/config_aarch64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/aarch64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc + supported_platforms: + - architecture: arm_8 + operating_systems: + - Linux + - Darwin + required_flags: + - asimd + - name: ppc64le + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=512 -DMLK_CONFIG_FILE="../../integration/liboqs/config_ppc64le.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_dec + sources: integration/liboqs/config_ppc64le.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/ppc64le mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc + supported_platforms: + - architecture: ppc64le + operating_systems: + - Linux diff --git a/integration/liboqs/ML-KEM-768_META.yml b/integration/liboqs/ML-KEM-768_META.yml index 254d67478..e230f3ba6 100644 --- a/integration/liboqs/ML-KEM-768_META.yml +++ b/integration/liboqs/ML-KEM-768_META.yml @@ -9,83 +9,74 @@ length-ciphertext: 1088 length-secret-key: 2400 length-shared-secret: 32 length-keypair-seed: 64 -length-encaps-seed: 32 nistkat-sha256: 5352539586b6c3df58be6158a6250aeff402bd73060b0a3de68850ac074c17c3 testvectors-sha256: 667c8ca2ca93729c0df6ff24588460bad1bbdbfb64ece0fe8563852a7ff348c6 principal-submitters: -- Peter Schwabe + - Peter Schwabe auxiliary-submitters: -- Roberto Avanzi -- Joppe Bos -- Léo Ducas -- Eike Kiltz -- Tancrède Lepoint -- Vadim Lyubashevsky -- John M. Schanck -- Gregor Seiler -- Damien Stehlé + - Roberto Avanzi + - Joppe Bos + - Léo Ducas + - Eike Kiltz + - Tancrède Lepoint + - Vadim Lyubashevsky + - John M. Schanck + - Gregor Seiler + - Damien Stehlé implementations: -- name: ref - version: FIPS203 - folder_name: . 
- compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="../../integration/liboqs/config_c.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_C_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_C_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_C_enc - signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM768_C_enc_derand - signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_C_dec - sources: integration/liboqs/config_c.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h - mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h - mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c - mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h - mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h - mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h - mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc -- name: x86_64 - version: FIPS203 - folder_name: . 
- compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="../../integration/liboqs/config_x86_64.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_enc - signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_enc_derand - signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_dec - sources: integration/liboqs/config_x86_64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h - mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h - mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c - mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h - mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h - mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h - mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/x86_64 - supported_platforms: - - architecture: x86_64 - operating_systems: - - Linux - - Darwin - required_flags: - - avx2 - - bmi2 - - popcnt -- name: aarch64 - version: FIPS203 - folder_name: . 
- compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="../../integration/liboqs/config_aarch64.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_enc - signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_enc_derand - signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_dec - sources: integration/liboqs/config_aarch64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h - mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h - mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c - mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h - mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h - mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h - mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/aarch64 - supported_platforms: - - architecture: arm_8 - operating_systems: - - Linux - - Darwin - required_flags: - - asimd + - name: ref + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="../../integration/liboqs/config_c.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_C_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_C_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_C_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_C_dec + sources: integration/liboqs/config_c.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc + - name: x86_64 + version: FIPS203 + folder_name: . + compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="../../integration/liboqs/config_x86_64.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_dec + sources: integration/liboqs/config_x86_64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/x86_64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc + supported_platforms: + - 
architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - avx2 + - bmi2 + - popcnt + - name: aarch64 + version: FIPS203 + folder_name: . + compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="../../integration/liboqs/config_aarch64.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_dec + sources: integration/liboqs/config_aarch64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/aarch64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc + supported_platforms: + - architecture: arm_8 + operating_systems: + - Linux + - Darwin + required_flags: + - asimd + - name: ppc64le + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="../../integration/liboqs/config_ppc64le.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_dec + sources: integration/liboqs/config_ppc64le.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/ppc64le mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc + supported_platforms: + - architecture: ppc64le + operating_systems: + - Linux diff --git a/integration/liboqs/config_ppc64le.h b/integration/liboqs/config_ppc64le.h new file mode 100644 index 000000000..2fa1cdbcf --- /dev/null +++ b/integration/liboqs/config_ppc64le.h @@ -0,0 +1,266 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [FIPS140_3_IG] + * Implementation Guidance for FIPS 140-3 and the Cryptographic Module + * Validation Program National Institute of Standards and Technology + * https://csrc.nist.gov/projects/cryptographic-module-validation-program/fips-140-3-ig-announcements + */ + +#ifndef MLK_INTEGRATION_LIBOQS_CONFIG_PPC64LE_H +#define MLK_INTEGRATION_LIBOQS_CONFIG_PPC64LE_H + +/****************************************************************************** + * Name: MLK_CONFIG_PARAMETER_SET + * + * Description: Specifies the parameter set for ML-KEM + * - MLK_CONFIG_PARAMETER_SET=512 corresponds to ML-KEM-512 + * - 
MLK_CONFIG_PARAMETER_SET=768 corresponds to ML-KEM-768 + * - MLK_CONFIG_PARAMETER_SET=1024 corresponds to ML-KEM-1024 + * + * This can also be set using CFLAGS. + * + *****************************************************************************/ +#ifndef MLK_CONFIG_PARAMETER_SET +#define MLK_CONFIG_PARAMETER_SET \ + 768 /* Change this for different security strengths */ +#endif + +/****************************************************************************** + * Name: MLK_CONFIG_NAMESPACE_PREFIX + * + * Description: The prefix to use to namespace global symbols from mlkem/. + * + * In a multi-level build (that is, if either + * - MLK_CONFIG_MULTILEVEL_WITH_SHARED, or + * - MLK_CONFIG_MULTILEVEL_NO_SHARED, + * are set), level-dependent symbols will additionally be prefixed + * with the parameter set (512/768/1024). + * + * This can also be set using CFLAGS. + * + *****************************************************************************/ +#if MLK_CONFIG_PARAMETER_SET == 512 +#define MLK_CONFIG_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE +#elif MLK_CONFIG_PARAMETER_SET == 768 +#define MLK_CONFIG_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE +#elif MLK_CONFIG_PARAMETER_SET == 1024 +#define MLK_CONFIG_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE +#endif + +/****************************************************************************** + * Name: MLK_CONFIG_USE_NATIVE_BACKEND_ARITH + * + * Description: Determines whether a native arithmetic backend should be used. + * + * The arithmetic backend covers performance critical functions + * such as the number-theoretic transform (NTT). + * + * If this option is unset, the C backend will be used. + * + * If this option is set, the arithmetic backend to be used is + * determined by MLK_CONFIG_ARITH_BACKEND_FILE: If the latter is + * unset, the default backend for the target architecture + * will be used. If set, it must be the name of a backend metadata + * file. 
+ * + * This can also be set using CFLAGS. + * + *****************************************************************************/ +#define MLK_CONFIG_USE_NATIVE_BACKEND_ARITH + +/****************************************************************************** + * Name: MLK_CONFIG_ARITH_BACKEND_FILE + * + * Description: The arithmetic backend to use. + * + * If MLK_CONFIG_USE_NATIVE_BACKEND_ARITH is unset, this option + * is ignored. + * + * If MLK_CONFIG_USE_NATIVE_BACKEND_ARITH is set, this option must + * either be undefined or the filename of an arithmetic backend. + * If unset, the default backend will be used. + * + * This can be set using CFLAGS. + * + *****************************************************************************/ +#define MLK_CONFIG_ARITH_BACKEND_FILE "native/meta.h" + +/****************************************************************************** + * Name: MLK_CONFIG_FIPS202_CUSTOM_HEADER + * + * Description: Custom header to use for FIPS-202 + * + * This should only be set if you intend to use a custom + * FIPS-202 implementation, different from the one shipped + * with mlkem-native. + * + * If set, it must be the name of a file serving as the + * replacement for mlkem/fips202/fips202.h, and exposing + * the same API (see FIPS202.md). + * + *****************************************************************************/ +/* +#define MLK_CONFIG_FIPS202_CUSTOM_HEADER \ + "../../integration/liboqs/fips202_glue.h" +*/ + +/****************************************************************************** + * Name: MLK_CONFIG_FIPS202X4_CUSTOM_HEADER + * + * Description: Custom header to use for FIPS-202-X4 + * + * This should only be set if you intend to use a custom + * FIPS-202 implementation, different from the one shipped + * with mlkem-native. + * + * If set, it must be the name of a file serving as the + * replacement for mlkem/fips202/fips202x4.h, and exposing + * the same API (see FIPS202.md). 
+ * + *****************************************************************************/ +/* +#define MLK_CONFIG_FIPS202X4_CUSTOM_HEADER \ + "../../integration/liboqs/fips202x4_glue.h" +*/ + +/****************************************************************************** + * Name: MLK_CONFIG_CUSTOM_ZEROIZE + * + * Description: In compliance with FIPS 203 Section 3.3, mlkem-native zeroizes + * intermediate stack buffers before returning from function calls. + * + * Set this option and define `mlk_zeroize` if you want to + * use a custom method to zeroize intermediate stack buffers. + * The default implementation uses SecureZeroMemory on Windows + * and a memset + compiler barrier otherwise. If neither of those + * is available on the target platform, compilation will fail, + * and you will need to use MLK_CONFIG_CUSTOM_ZEROIZE to provide + * a custom implementation of `mlk_zeroize()`. + * + * WARNING: + * The explicit stack zeroization conducted by mlkem-native + * reduces the likelihood of data leaking on the stack, but + * does not eliminate it! The C standard makes no guarantee about + * where a compiler allocates structures and whether/where it makes + * copies of them. Also, in addition to entire structures, there + * may also be potentially exploitable leakage of individual values + * on the stack. + * + * If you need bullet-proof zeroization of the stack, you need to + * consider additional measures instead of what this feature + * provides. In this case, you can set mlk_zeroize to a no-op. + * + *****************************************************************************/ +/* #define MLK_CONFIG_CUSTOM_ZEROIZE + #if !defined(__ASSEMBLER__) + #include + #include "sys.h" + static MLK_INLINE void mlk_zeroize(void *ptr, size_t len) + { + ... your implementation ... 
+ } + #endif +*/ + +/****************************************************************************** + * Name: MLK_CONFIG_CUSTOM_RANDOMBYTES + * + * Description: mlkem-native does not provide a secure randombytes + * implementation. Such an implementation has to be provided by the + * consumer. + * + * If this option is not set, mlkem-native expects a function + * void randombytes(uint8_t *out, size_t outlen). + * + * Set this option and define `mlk_randombytes` if you want to + * use a custom method to sample randombytes with a different name + * or signature. + * + *****************************************************************************/ +#define MLK_CONFIG_CUSTOM_RANDOMBYTES +#if !defined(__ASSEMBLER__) +#include +#include +#include "../../mlkem/src/sys.h" +static MLK_INLINE void mlk_randombytes(uint8_t *ptr, size_t len) +{ + OQS_randombytes(ptr, len); /* fill ptr[0..len) by delegating to OQS_randombytes */ +} +#endif /* !__ASSEMBLER__ */ + +/****************************************************************************** + * Name: MLK_CONFIG_NO_ASM + * + * Description: If this option is set, mlkem-native will be built without + * use of native code or inline assembly. + * + * By default, inline assembly is used to implement value barriers. + * Without inline assembly, mlkem-native will use a global volatile + * 'opt blocker' instead; see verify.h. + * + * Inline assembly is also used to implement a secure zeroization + * function on non-Windows platforms. If this option is set and + * the target platform is not Windows, you MUST set + * MLK_CONFIG_CUSTOM_ZEROIZE and provide a custom zeroization + * function. + * + * If this option is set, MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 and + * MLK_CONFIG_USE_NATIVE_BACKEND_ARITH will be ignored, and no + * native backends will be used. 
+ * + *****************************************************************************/ +/* #define MLK_CONFIG_NO_ASM */ + +/****************************************************************************** + * Name: MLK_CONFIG_KEYGEN_PCT + * + * Description: Compliance with @[FIPS140_3_IG, p.87] requires a + * Pairwise Consistency Test (PCT) to be carried out on a freshly + * generated keypair before it can be exported. + * + * Set this option if such a check should be implemented. + * In this case, crypto_kem_keypair_derand and crypto_kem_keypair + * will return a non-zero error code if the PCT failed. + * + * NOTE: This feature will drastically lower the performance of + * key generation. + * + *****************************************************************************/ +/* #define MLK_CONFIG_KEYGEN_PCT */ + +/****************************************************************************** + * Name: MLK_CONFIG_KEYGEN_PCT_BREAKAGE_TEST + * + * Description: If this option is set, the user must provide a runtime + * function `static inline int mlk_break_pct() { ... }` to + * indicate whether the PCT should be made to fail. + * + * This option only has an effect if MLK_CONFIG_KEYGEN_PCT is set. + * + *****************************************************************************/ +/* #define MLK_CONFIG_KEYGEN_PCT_BREAKAGE_TEST + #if !defined(__ASSEMBLER__) + #include "sys.h" + static MLK_INLINE int mlk_break_pct(void) + { + ... return 0/1 depending on whether PCT should be broken ... + } + #endif +*/ + +/* Enable valgrind-based assertions in mlkem-native through macro + * from libOQS. 
*/ +#if !defined(__ASSEMBLER__) +#include +#if defined(OQS_ENABLE_TEST_CONSTANT_TIME) +#define MLK_CONFIG_CT_TESTING_ENABLED +#endif +#endif /* !__ASSEMBLER__ */ + +#endif /* !MLK_INTEGRATION_LIBOQS_CONFIG_PPC64LE_H */ diff --git a/mlkem/src/native/meta.h b/mlkem/src/native/meta.h index f2b9b848b..7fdcd6fcf 100644 --- a/mlkem/src/native/meta.h +++ b/mlkem/src/native/meta.h @@ -18,4 +18,8 @@ #include "x86_64/meta.h" #endif +#ifdef MLK_SYS_PPC64LE +#include "ppc64le/meta.h" +#endif /* MLK_SYS_PPC64LE */ + #endif /* !MLK_NATIVE_META_H */ diff --git a/mlkem/src/native/ppc64le/README.md b/mlkem/src/native/ppc64le/README.md new file mode 100644 index 000000000..5125a40ea --- /dev/null +++ b/mlkem/src/native/ppc64le/README.md @@ -0,0 +1,6 @@ +[//]: # (SPDX-License-Identifier: CC-BY-4.0) + +# ppc64le backend (little endian) + +This directory contains a native backend for little endian POWER 8 (ppc64le) and above systems. + diff --git a/mlkem/src/native/ppc64le/meta.h b/mlkem/src/native/ppc64le/meta.h new file mode 100644 index 000000000..bee788976 --- /dev/null +++ b/mlkem/src/native/ppc64le/meta.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLK_NATIVE_PPC64LE_META_H +#define MLK_NATIVE_PPC64LE_META_H + +/* Identifier for this backend so that source and assembly files + * in the build can be appropriately guarded. 
*/ +#define MLK_ARITH_BACKEND_PPC64LE_DEFAULT + +#define MLK_ARITH_BACKEND_NAME PPC64LE_DEFAULT + +/* Set of primitives that this backend replaces */ +#define MLK_USE_NATIVE_NTT +#define MLK_USE_NATIVE_INTT +#define MLK_USE_NATIVE_POLY_REDUCE +#define MLK_USE_NATIVE_POLY_TOMONT + +#if !defined(__ASSEMBLER__) +#include +#include "../../common.h" +#include "../../params.h" +#include "../api.h" +#include "src/arith_native_ppc64le.h" + +static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) { /* wrapper for MLK_USE_NATIVE_NTT */ + mlk_ntt_ppc(data); /* hand the coefficient array to the assembly routine */ + return MLK_NATIVE_FUNC_SUCCESS; /* unconditional: the routine returns void */ +} + +static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) { /* wrapper for MLK_USE_NATIVE_INTT */ + mlk_intt_ppc(data); /* hand the coefficient array to the assembly routine */ + return MLK_NATIVE_FUNC_SUCCESS; /* unconditional: the routine returns void */ +} + +static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) { /* wrapper for MLK_USE_NATIVE_POLY_REDUCE */ + mlk_reduce_ppc(data); /* hand the coefficient array to the assembly routine */ + return MLK_NATIVE_FUNC_SUCCESS; /* unconditional: the routine returns void */ +} + +static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) { /* wrapper for MLK_USE_NATIVE_POLY_TOMONT */ + mlk_poly_tomont_ppc(data); /* hand the coefficient array to the assembly routine */ + return MLK_NATIVE_FUNC_SUCCESS; /* unconditional: the routine returns void */ +} +#endif /* !__ASSEMBLER__ */ + +#endif /* MLK_NATIVE_PPC64LE_META_H */ diff --git a/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h b/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h new file mode 100644 index 000000000..57f0b8f8c --- /dev/null +++ b/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2024-2025 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#define MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H + +#include +#include "../../../common.h" + +#define mlk_ntt_ppc MLK_NAMESPACE(ntt_ppc) /* symbol name goes through MLK_NAMESPACE */ +void mlk_ntt_ppc(int16_t *); + +#define mlk_intt_ppc MLK_NAMESPACE(intt_ppc) /* symbol name goes through MLK_NAMESPACE */ +void mlk_intt_ppc(int16_t *); + +#define mlk_reduce_ppc MLK_NAMESPACE(reduce_ppc) /* symbol name goes through MLK_NAMESPACE */ +void mlk_reduce_ppc(int16_t *r); + +#define mlk_poly_tomont_ppc MLK_NAMESPACE(poly_tomont_ppc) /* symbol name goes through MLK_NAMESPACE */ +void mlk_poly_tomont_ppc(int16_t *); + +#endif /* MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H */ diff --git a/mlkem/src/native/ppc64le/src/intt_ppc.S 
b/mlkem/src/native/ppc64le/src/intt_ppc.S new file mode 100644 index 000000000..feb78b984 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/intt_ppc.S @@ -0,0 +1,773 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. +# +#=================================================================================== +# Written by Danny Tsen +# + +#include "../../../common.h" + +.machine "any" +.text + +# Barrett reduce constants +#define V20159 0 +#define V_25 1 +#define V_26 2 +#define V_MKQ 3 + +# Montgomery reduce constants +#define V_QINV 2 +#define V_NMKQ 5 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 +#define V_ZETA 10 +#define V1441 10 + +.macro Load_4Coeffs start next step + mr 9, \start # j + add 10, 4, 9 # j + len*2 (r4 holds len*2 in bytes) + addi 16, 9, \next + addi 17, 10, \step + addi 18, 16, \next + addi 19, 17, \step + addi 20, 18, \next + addi 21, 19, \step + lxvd2x 32+8, 3, 10 # r[j+len] + lxvd2x 32+12, 3, 17 # r[j+len] + lxvd2x 32+16, 3, 19 # r[j+len] + lxvd2x 32+20, 3, 21 # r[j+len] + xxpermdi 32+8, 32+8, 32+8, 2 # swap doublewords + xxpermdi 32+12, 32+12, 32+12, 2 + xxpermdi 32+16, 32+16, 32+16, 2 + xxpermdi 32+20, 32+20, 32+20, 2 + + lxvd2x 32+21, 3, 9 # t = r[j] + lxvd2x 32+22, 3, 16 # t = r[j] + lxvd2x 32+23, 3, 18 # t = r[j] + lxvd2x 32+24, 3, 20 # t = r[j] + xxpermdi 32+21, 32+21, 32+21, 2 # swap doublewords + xxpermdi 32+22, 32+22, 32+22, 2 + xxpermdi 32+23, 32+23, 32+23, 2 + xxpermdi 32+24, 32+24, 32+24, 2 + + vsubuhm 25, 8, 21 # r[j+len] - t + vsubuhm 26, 12, 22 # r[j+len] - t + vsubuhm 30, 16, 23 # r[j+len] - t + vsubuhm 31, 20, 24 # r[j+len] - t + vadduhm 8, 8, 21 # r[j+len] + t + vadduhm 12, 12, 22 # r[j+len] + t + vadduhm 16, 16, 23 # r[j+len] + t + vadduhm 20, 20, 24 # r[j+len] + t +.endm + +.macro BREDUCE_4X _v0 _v1 _v2 _v3 + vxor 7, 7, 7 + xxlor 32+3, 6, 6 # V_MKQ + xxlor 32+1, 7, 7 # V_25 + xxlor 32+2, 8, 8 # V_26 + vmulosh 6, 8, V20159 + vmulesh 5, 8, V20159 + vmulosh 11, 12, V20159 + vmulesh 10, 12, V20159 + vmulosh 15, 16, 
V20159 + vmulesh 14, 16, V20159 + vmulosh 19, 20, V20159 + vmulesh 18, 20, V20159 + xxmrglw 32+4, 32+5, 32+6 + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V_25 + vadduwm 5, 5, V_25 + vadduwm 9, 9, V_25 + vadduwm 10, 10, V_25 + vadduwm 13, 13, V_25 + vadduwm 14, 14, V_25 + vadduwm 17, 17, V_25 + vadduwm 18, 18, V_25 + vsraw 4, 4, V_26 + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm \_v0, 4, V_MKQ, 8 + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 +.endm + +#----------------------------------- +# MREDUCE_4X(_vz0, _vz1, _vz2, _vz3, _vo0, _vo1, _vo2, _vo3) +# Multiplies v25, v26, v30, v31 by _vz0.._vz3; results go to _vo0.._vo3 +.macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 + vmladduhm 15, 25, \_vz0, 3 + vmladduhm 20, 26, \_vz1, 3 + vmladduhm 27, 30, \_vz2, 3 + vmladduhm 28, 31, \_vz3, 3 + + vmhraddshs 14, 25, \_vz0, 3 + vmhraddshs 19, 26, \_vz1, 3 + vmhraddshs 24, 30, \_vz2, 3 + vmhraddshs 29, 31, \_vz3, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 27, V_QINV, 3 + vmladduhm 30, 28, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 + + vsrah \_vo0, 15, 4 # >> 1 + vsrah \_vo1, 20, 4 # >> 1 + vsrah \_vo2, 25, 4 # >> 1 + vsrah \_vo3, 30, 4 # >> 1 +.endm + +.macro Set_mont_consts + xxlor 32+5, 0, 0 # V_NMKQ + xxlor 32+2, 2, 2 # V_QINV + xxlor 32+3, 3, 3 # 0 + xxlor 32+4, 4, 4 # 1 +.endm + +.macro Load_next_4zetas + lxv 32+V_Z0, 0(14) + lxv 32+V_Z1, 16(14) + lxv 32+V_Z2, 32(14) + lxv 32+V_Z3, 48(14) + addi 14, 14, 64 +.endm + +.macro Write_B4C 
_vs0 _vs1 _vs2 _vs3 + stxvx \_vs0, 3, 9 + stxvx \_vs1, 3, 16 + stxvx \_vs2, 3, 18 + stxvx \_vs3, 3, 20 +.endm + +.macro Write_M4C _vs0 _vs1 _vs2 _vs3 + stxvx \_vs0, 3, 10 + stxvx \_vs1, 3, 17 + stxvx \_vs2, 3, 19 + stxvx \_vs3, 3, 21 +.endm + +.macro Reload_4coeffs + lxv 32+25, 0(3) + lxv 32+26, 16(3) + lxv 32+30, 32(3) + lxv 32+31, 48(3) + addi 3, 3, 64 +.endm + +.macro MWrite_8X _vs0 _vs1 _vs2 _vs3 _vs4 _vs5 _vs6 _vs7 + stxv \_vs0, -128(3) + stxv \_vs1, -112(3) + stxv \_vs2, -96(3) + stxv \_vs3, -80(3) + stxv \_vs4, -64(3) + stxv \_vs5, -48(3) + stxv \_vs6, -32(3) + stxv \_vs7, -16(3) +.endm + +.macro Write_Len2_4C _vs0 _vs1 _vs2 _vs3 + xxmrglw 32+12, \_vs0, 10 + xxmrghw 32+11, \_vs0, 10 + xxpermdi 10, 32+12, 32+11, 3 + xxmrglw 32+16, \_vs1, 11 + xxmrghw 32+15, \_vs1, 11 + xxpermdi 11, 32+16, 32+15, 3 + xxmrglw 32+12, \_vs2, 12 + xxmrghw 32+11, \_vs2, 12 + xxpermdi 12, 32+12, 32+11, 3 + xxmrglw 32+16, \_vs3, 13 + xxmrghw 32+15, \_vs3, 13 + xxpermdi 13, 32+16, 32+15, 3 + stxvd2x 10, 3, 9 + stxvd2x 11, 3, 16 + stxvd2x 12, 3, 18 + stxvd2x 13, 3, 20 +.endm + +.macro Write_Len4_4C _vs0 _vs1 _vs2 _vs3 + xxpermdi 10, 10, \_vs0, 3 + xxpermdi 11, 11, \_vs1, 3 + xxpermdi 12, 12, \_vs2, 3 + xxpermdi 13, 13, \_vs3, 3 + stxvd2x 10, 3, 9 + stxvd2x 11, 3, 16 + stxvd2x 12, 3, 18 + stxvd2x 13, 3, 20 +.endm + +# intt +# t = r[j]; +# r[j] = barrett_reduce(t + r[j + len]); +# r[j + len] = r[j + len] - t; +# r[j + len] = fqmul(zeta, r[j + len]); + +# +# mlk_intt_ppc(r) +# +.global MLK_ASM_NAMESPACE(intt_ppc) +.align 4 +MLK_ASM_FN_SYMBOL(intt_ppc) + + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + stxv 32+25, 208(1) + stxv 32+26, 224(1) + stxv 32+27, 240(1) + stxv 32+28, 256(1) + stxv 32+29, 272(1) + stxv 32+30, 288(1) + stxv 32+31, 304(1) + + # init vectors and constants + 
# Setup for Montgomery reduce + addis 8,2,.nmkq@toc@ha + addi 8,8,.nmkq@toc@l + lxv 0, 0(8) + + lxv 32+V_QINV, 16(8) # QINV + xxlxor 32+3, 32+3, 32+3 + vspltish 4, 1 + xxlor 2, 32+2, 32+2 + xxlor 3, 32+3, 32+3 + xxlor 4, 32+4, 32+4 + + # Setup for Barrett reduce + addis 8,2,.mkq@toc@ha + addi 8,8,.mkq@toc@l + addis 9,2,.C20159@toc@ha + addi 9,9,.C20159@toc@l + addis 10,2,.C25@toc@ha + addi 10,10,.C25@toc@l + + lxv 6, 0(8) # V_MKQ + lxv 32+0, 0(9) # V20159 + lxv 7, 0(10) # V_25 + + #xxspltiw 8, 26 # for power9 and above + vspltisw 8, 13 + vadduwm 8, 8, 8 + xxlor 8, 32+8, 32+8 + + # zetas array + #addis 14,2,.izeta63@toc@ha + #addi 14,14,.izeta63@toc@l + +.align 4 +__Len2: + # + # 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 + # Update zetas vectors, each vector has 2 zetas + addis 14,2,.izeta127@toc@ha + addi 14,14,.izeta127@toc@l + li 4, 4 + li 15, 4 + mtctr 15 + li 5, 0 +__Loop2: + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_Len2_4C 32+13, 32+18, 32+23, 32+28 + + addi 5, 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_Len2_4C 32+13, 32+18, 32+23, 32+28 + addi 5, 5, 64 + bdnz __Loop2 + +.align 4 +__Len4: + # + # 2. 
len = 4, start = 0, 8, 16, 24,...232, 240, 248 + addis 14,2,.izeta63@toc@ha + addi 14,14,.izeta63@toc@l + li 5, 0 + li 4, 8 + li 15, 4 # loops + mtctr 15 +__Loop4: + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_Len4_4C 32+13, 32+18, 32+23, 32+28 + addi 5, 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_Len4_4C 32+13, 32+18, 32+23, 32+28 + addi 5, 5, 64 + bdnz __Loop4 + +.align 4 +__Len8: + # 3. len = 8, start = 0, 16, 32, 48,...208, 224, 240 + #addi 14, 14, 512 + li 4, 16 + li 5, 0 + + Load_4Coeffs 5, 32, 32 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 128 + + Load_4Coeffs 5, 32, 32 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 256 + + Load_4Coeffs 5, 32, 32 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 384 + + Load_4Coeffs 5, 32, 32 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 +__Len16: + # + # 4. 
len = 16, start = 0, 32, 64,,...160, 192, 224 + #addi 14, 14, 768 + li 5, 0 + li 4, 32 + + Load_4Coeffs 5, 64, 64 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 16 + Load_4Coeffs 5, 64, 64 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + addi 14, 14, -64 + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + li 5, 256 + Load_4Coeffs 5, 64, 64 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + li 5, 272 + Load_4Coeffs 5, 64, 64 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + addi 14, 14, -64 + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 +__Len32: + # + # 5. 
len = 32, start = 0, 64, 128, 192 + #addi 14, 14, 896 + li 5, 0 + li 4, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 128 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 256 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 384 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 +__Len64: + # + # 6. 
len = 64, start = 0, 128 + #addi 14, 14, 960 + li 5, 0 + li 4, 128 + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lxv 32+10, -16(14) + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 256 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 320 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lxv 32+10, -16(14) + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 +__Len128: + # 7. 
len = 128, start = 0 + # + #addi 14, 14, 992 + li 5, 0 # start + li 4, 256 # len * 2 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + xxlor 9, 32+10, 32+10 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 128 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 192 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 + # + # Montgomery reduce loops with constant 1441 + # + addis 10,2,.C1441@toc@ha + addi 10,10,.C1441@toc@l + lvx V1441, 0, 10 + + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 
32+18, 32+23, 32+28 + +__intt_out: + lxv 32+20, 128(1) + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + lxv 32+25, 208(1) + lxv 32+26, 224(1) + lxv 32+27, 240(1) + lxv 32+28, 256(1) + lxv 32+29, 272(1) + lxv 32+30, 288(1) + lxv 32+31, 304(1) + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 + blr + +.data +.align 4 +# -MLKEM_Q +.nmkq: +.short -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329 +# QINV +.short -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327 + +# MLKEM_Q +.mkq: +.short 3329, 3329, 3329, 3329, 3329, 3329, 3329, 3329 + +.C20159: +.short 20159, 20159, 20159, 20159, 20159, 20159, 20159, 20159 + +# 0x2000000 +.C25: +.long 33554432, 33554432, 33554432, 33554432 + +.C1441: +.short 1441, 1441, 1441, 1441, 1441, 1441, 1441, 1441 + +.align 4 +.izeta127: +.short 1628, 1628, 1628, 1628, 1522, 1522, 1522, 1522 +.short -1460, -1460, -1460, -1460, 958, 958, 958, 958 +.short 991, 991, 991, 991, 996, 996, 996, 996 +.short -308, -308, -308, -308, -108, -108, -108, -108 +.short 478, 478, 478, 478, -870, -870, -870, -870 +.short -854, -854, -854, -854, -1510, -1510, -1510, -1510 +.short 794, 794, 794, 794, -1278, -1278, -1278, -1278 +.short -1530, -1530, -1530, -1530, -1185, -1185, -1185, -1185 +.short -1659, -1659, -1659, -1659, -1187, -1187, -1187, -1187 +.short 220, 220, 220, 220, -874, -874, -874, -874 +.short -1335, -1335, -1335, -1335, 1218, 1218, 1218, 1218 +.short -136, -136, -136, -136, -1215, -1215, -1215, -1215 +.short 384, 384, 384, 384, -1465, -1465, -1465, -1465 +.short -1285, -1285, -1285, -1285, 1322, 1322, 1322, 1322 +.short 610, 610, 610, 610, 603, 603, 603, 603 +.short 1097, 1097, 1097, 1097, 817, 817, 817, 817 +.short -75, -75, -75, -75, -156, -156, -156, -156 +.short 329, 329, 329, 329, 418, 418, 418, 418 +.short 349, 349, 349, 349, -872, -872, -872, -872 +.short 644, 644, 644, 644, -1590, 
-1590, -1590, -1590 +.short 1119, 1119, 1119, 1119, -602, -602, -602, -602 +.short 1483, 1483, 1483, 1483, -777, -777, -777, -777 +.short -147, -147, -147, -147, 1159, 1159, 1159, 1159 +.short 778, 778, 778, 778, -246, -246, -246, -246 +.short 1653, 1653, 1653, 1653, 1574, 1574, 1574, 1574 +.short -460, -460, -460, -460, -291, -291, -291, -291 +.short -235, -235, -235, -235, 177, 177, 177, 177 +.short 587, 587, 587, 587, 422, 422, 422, 422 +.short 105, 105, 105, 105, 1550, 1550, 1550, 1550 +.short 871, 871, 871, 871, -1251, -1251, -1251, -1251 +.short 843, 843, 843, 843, 555, 555, 555, 555 +.short 430, 430, 430, 430, -1103, -1103, -1103, -1103 +.izeta63: +.short -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275 +.short 677, 677, 677, 677, 677, 677, 677, 677 +.short -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065 +.short 448, 448, 448, 448, 448, 448, 448, 448 +.short -725, -725, -725, -725, -725, -725, -725, -725 +.short -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508 +.short 961, 961, 961, 961, 961, 961, 961, 961 +.short -398, -398, -398, -398, -398, -398, -398, -398 +.short -951, -951, -951, -951, -951, -951, -951, -951 +.short -247, -247, -247, -247, -247, -247, -247, -247 +.short -1421, -1421, -1421, -1421, -1421, -1421, -1421, -1421 +.short 107, 107, 107, 107, 107, 107, 107, 107 +.short 830, 830, 830, 830, 830, 830, 830, 830 +.short -271, -271, -271, -271, -271, -271, -271, -271 +.short -90, -90, -90, -90, -90, -90, -90, -90 +.short -853, -853, -853, -853, -853, -853, -853, -853 +.short 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469 +.short 126, 126, 126, 126, 126, 126, 126, 126 +.short -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162 +.short -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618 +.short -666, -666, -666, -666, -666, -666, -666, -666 +.short -320, -320, -320, -320, -320, -320, -320, -320 +.short -8, -8, -8, -8, -8, -8, -8, -8 +.short 516, 516, 516, 516, 516, 516, 516, 516 +.short -1544, -1544, -1544, -1544, -1544, 
-1544, -1544, -1544 +.short -282, -282, -282, -282, -282, -282, -282, -282 +.short 1491, 1491, 1491, 1491, 1491, 1491, 1491, 1491 +.short -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293 +.short 1015, 1015, 1015, 1015, 1015, 1015, 1015, 1015 +.short -552, -552, -552, -552, -552, -552, -552, -552 +.short 652, 652, 652, 652, 652, 652, 652, 652 +.short 1223, 1223, 1223, 1223, 1223, 1223, 1223, 1223 +.short -1571, -1571, -1571, -1571, -1571, -1571, -1571, -1571 +.short -205, -205, -205, -205, -205, -205, -205, -205 +.short 411, 411, 411, 411, 411, 411, 411, 411 +.short -1542, -1542, -1542, -1542, -1542, -1542, -1542, -1542 +.short 608, 608, 608, 608, 608, 608, 608, 608 +.short 732, 732, 732, 732, 732, 732, 732, 732 +.short 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017 +.short -681, -681, -681, -681, -681, -681, -681, -681 +.short -130, -130, -130, -130, -130, -130, -130, -130 +.short -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602 +.short 1458, 1458, 1458, 1458, 1458, 1458, 1458, 1458 +.short -829, -829, -829, -829, -829, -829, -829, -829 +.short 383, 383, 383, 383, 383, 383, 383, 383 +.short 264, 264, 264, 264, 264, 264, 264, 264 +.short -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325 +.short 573, 573, 573, 573, 573, 573, 573, 573 +.short 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468 +.short -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474 +.short -1202, -1202, -1202, -1202, -1202, -1202, -1202, -1202 +.short 962, 962, 962, 962, 962, 962, 962, 962 +.short 182, 182, 182, 182, 182, 182, 182, 182 +.short 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577 +.short 622, 622, 622, 622, 622, 622, 622, 622 +.short -171, -171, -171, -171, -171, -171, -171, -171 +.short 202, 202, 202, 202, 202, 202, 202, 202 +.short 287, 287, 287, 287, 287, 287, 287, 287 +.short 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422 +.short 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493 +.short -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517 +.short -359, -359, -359, -359, 
-359, -359, -359, -359 +.short -758, -758, -758, -758, -758, -758, -758, -758 diff --git a/mlkem/src/native/ppc64le/src/ntt_ppc.S b/mlkem/src/native/ppc64le/src/ntt_ppc.S new file mode 100644 index 000000000..172fef9cc --- /dev/null +++ b/mlkem/src/native/ppc64le/src/ntt_ppc.S @@ -0,0 +1,498 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. +# +#=================================================================================== +# Written by Danny Tsen +# + +#include "../../../common.h" + +#define V_QINV 2 +#define V_NMKQ 5 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 +#define V_ZETA 10 + +.machine "any" +.text + +# +# montgomery_reduce +# t = a * QINV +# t = (a - (int32_t)t*_MLKEM_Q) >> 16 +# +#----------------------------------- +# MREDUCE_4X(start, _vz0, _vz1, _vz2, _vz3) +# +.macro MREDUCE_4X start next step _vz0 _vz1 _vz2 _vz3 + mr 9, \start + add 10, 4, 9 # J + len*2 + addi 16, 9, \next + addi 17, 10, \step + addi 18, 16, \next + addi 19, 17, \step + addi 20, 18, \next + addi 21, 19, \step + lxvd2x 32+13, 3, 10 # r[j+len] + lxvd2x 32+18, 3, 17 # r[j+len] + lxvd2x 32+23, 3, 19 # r[j+len] + lxvd2x 32+28, 3, 21 # r[j+len] + xxpermdi 32+13, 32+13, 32+13, 2 + xxpermdi 32+18, 32+18, 32+18, 2 + xxpermdi 32+23, 32+23, 32+23, 2 + xxpermdi 32+28, 32+28, 32+28, 2 + + # fqmul = zeta * coefficient + vmladduhm 15, 13, \_vz0, 3 + vmladduhm 20, 18, \_vz1, 3 + vmladduhm 25, 23, \_vz2, 3 + vmladduhm 30, 28, \_vz3, 3 + + vmhraddshs 14, 13, \_vz0, 3 + vmhraddshs 19, 18, \_vz1, 3 + vmhraddshs 24, 23, \_vz2, 3 + vmhraddshs 29, 28, \_vz3, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, V_QINV, 3 + vmladduhm 30, 30, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 + + vsrah 13, 15, 4 # >> 1 + vsrah 18, 20, 4 # >> 1 + vsrah 23, 25, 4 # >> 1 
+ vsrah 28, 30, 4 # >> 1 + + lxvd2x 32+12, 3, 9 # r[j] + lxvd2x 32+17, 3, 16 # r[j] + lxvd2x 32+22, 3, 18 # r[j] + lxvd2x 32+27, 3, 20 # r[j] + xxpermdi 32+12, 32+12, 32+12, 2 + xxpermdi 32+17, 32+17, 32+17, 2 + xxpermdi 32+22, 32+22, 32+22, 2 + xxpermdi 32+27, 32+27, 32+27, 2 + + vsubuhm 16, 12, 13 # r - t + vadduhm 15, 13, 12 # r + t + vsubuhm 21, 17, 18 # r - t + vadduhm 20, 18, 17 # r + t + vsubuhm 26, 22, 23 # r - t + vadduhm 25, 23, 22 # r + t + vsubuhm 31, 27, 28 # r - t + vadduhm 30, 28, 27 # r + t +.endm + +.macro Write_One + stxvx 32+15, 3, 9 # sum vector goes back to r[j] + stxvx 32+16, 3, 10 # difference vector goes back to r[j+len] + stxvx 32+20, 3, 16 + stxvx 32+21, 3, 17 + stxvx 32+25, 3, 18 + stxvx 32+26, 3, 19 + stxvx 32+30, 3, 20 + stxvx 32+31, 3, 21 +.endm + +.macro Write_Two + xxpermdi 32+17, 32+16, 32+15, 3 # merge one doubleword each of the difference and sum vectors (used by the len = 4 layer) + xxpermdi 32+22, 32+21, 32+20, 3 + xxpermdi 32+27, 32+26, 32+25, 3 + xxpermdi 32+29, 32+31, 32+30, 3 + + stxvx 32+17, 3, 9 # only the four r[j] addresses are written + stxvx 32+22, 3, 16 + stxvx 32+27, 3, 18 + stxvx 32+29, 3, 20 +.endm + +.macro Write_Three + xxmrglw 32+14, 32+16, 32+15 # interleave words of the difference and sum vectors (used by the len = 2 layer) + xxmrghw 32+13, 32+16, 32+15 + xxpermdi 32+17, 32+13, 32+14, 3 # then merge one doubleword of each interleaved vector + xxmrglw 32+19, 32+21, 32+20 + xxmrghw 32+18, 32+21, 32+20 + xxpermdi 32+22, 32+18, 32+19, 3 + xxmrglw 32+14, 32+26, 32+25 + xxmrghw 32+13, 32+26, 32+25 + xxpermdi 32+27, 32+13, 32+14, 3 + xxmrglw 32+24, 32+31, 32+30 + xxmrghw 32+23, 32+31, 32+30 + xxpermdi 32+29, 32+23, 32+24, 3 + stxvx 32+17, 3, 9 + stxvx 32+22, 3, 16 + stxvx 32+27, 3, 18 + stxvx 32+29, 3, 20 +.endm + +.macro Load_next_4zetas + lxv 32+V_Z0, 0(14) # four consecutive 16-byte zeta vectors at r14 + lxv 32+V_Z1, 16(14) + lxv 32+V_Z2, 32(14) + lxv 32+V_Z3, 48(14) + addi 14, 14, 64 # advance the zeta pointer past them +.endm + +# +# mlk_ntt_ppc(int16_t *r): NTT computed in place on the coefficients at r (r3) +# +.global MLK_ASM_NAMESPACE(ntt_ppc) +.align 4 +MLK_ASM_FN_SYMBOL(ntt_ppc) + + stdu 1, -352(1) # allocate a 352-byte stack frame + mflr 0 + std 14, 56(1) # save r14-r21, restored at __ntt_out + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + stxv 32+20, 128(1) # save v20-v31, restored at __ntt_out + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + stxv 32+25, 208(1) + stxv 32+26, 
224(1) + stxv 32+27, 240(1) + stxv 32+28, 256(1) + stxv 32+29, 272(1) + stxv 32+30, 288(1) + stxv 32+31, 304(1) + + # load -MLKEM_Q (the .nmkq vector) + addis 8,2,.nmkq@toc@ha + addi 8,8,.nmkq@toc@l + lvx V_NMKQ,0,8 + + # r14 = zetas array (.K1) + addis 14,2,.K1@toc@ha + addi 14,14,.K1@toc@l + + vxor 3, 3, 3 # v3 = 0 + vspltish 4, 1 # v4 = 1 in each halfword, the vsrah shift count in MREDUCE_4X + lxv 32+V_QINV, 16(8) # QINV vector is stored right after .nmkq + +.align 4 +__Len128: + # + # Compute coefficients of the NTT based on the following loop. + # for (len = 128; len ≥ 2; len = len/2) + # + # 1. len = 128, start = 0 + # + li 5, 0 # start + li 4, 256 # len * 2 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 64 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 128 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 192 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + +.align 4 +__Len64: + # + # 2. len = 64, start = 0, 128 + # k += 2 + li 5, 0 + li 4, 128 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 64 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 256 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 320 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + +.align 4 +__Len32: + # + # 3. len = 32, start = 0, 64, 128, 192 + # k += 4 + li 5, 0 + li 4, 64 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + # start = 64 coefficients, i.e. byte offset 128 + li 5, 128 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + # start = 128 coefficients, i.e. byte offset 256 + li 5, 256 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + # start = 192 coefficients, i.e. byte offset 384 + li 5, 384 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + +.align 4 +__Len16: + # + # 4. 
len = 16, start = 0, 32, 64, ..., 160, 192, 224 + # k += 8 + li 5, 0 + li 4, 32 # len * 2 + Load_next_4zetas + MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 16 + MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + + Load_next_4zetas + li 5, 256 + MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 272 + MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + +.align 4 +__Len8: + # + # 5. len = 8, start = 0, 16, 32, 48, ..., 208, 224, 240 + # k += 16 + li 5, 0 + li 4, 16 # len * 2 + Load_next_4zetas + MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 128 + + Load_next_4zetas + MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 256 + + Load_next_4zetas + MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 384 + + Load_next_4zetas + MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + + # + # 6. len = 4, start = 0, 8, 16, 24, ..., 232, 240, 248 + # k += 32 + li 15, 4 # loops + mtctr 15 + li 5, 0 + li 4, 8 # len * 2. NOTE(review): in the last pass (r5 = 448) the final r[j+len] load is 16 bytes wide at byte offset 504, which looks like an 8-byte read past a 512-byte polynomial - confirm +.align 4 +__Len4: + Load_next_4zetas + MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 + Write_Two + addi 5, 5, 64 + + Load_next_4zetas + MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 + Write_Two + addi 5, 5, 64 + + bdnz __Len4 + + # + # 7. 
len = 2, start = 0, 4, 8, 12, ..., 244, 248, 252 + # k += 64 + # Update zetas vectors, each vector has 2 zetas + addis 14,2,.K64@toc@ha + addi 14,14,.K64@toc@l + + li 15, 4 # loops + mtctr 15 + li 5, 0 + li 4, 4 # len * 2. NOTE(review): in the last pass (r5 = 448) the final r[j+len] load is 16 bytes wide at byte offset 500, which looks like a 4-byte read past a 512-byte polynomial - confirm +.align 4 +__Len2: + Load_next_4zetas + MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 + Write_Three + addi 5, 5, 64 + + Load_next_4zetas + MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 + Write_Three + addi 5, 5, 64 + + bdnz __Len2 + +__ntt_out: + lxv 32+20, 128(1) # restore v20-v31 + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + lxv 32+25, 208(1) + lxv 32+26, 224(1) + lxv 32+27, 240(1) + lxv 32+28, 256(1) + lxv 32+29, 272(1) + lxv 32+30, 288(1) + lxv 32+31, 304(1) + ld 14, 56(1) # restore r14-r21 + ld 15, 64(1) + ld 16, 72(1) + ld 16, 72(1) # NOTE(review): exact duplicate of the previous restore, redundant and can be dropped + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 # release the 352-byte frame + blr + +.data +.align 4 +# -MLKEM_Q +.nmkq: +.short -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329 +# QINV +.short -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327 + +# zetas +.K1: +.short -758, -758, -758, -758, -758, -758, -758, -758 +.short -359, -359, -359, -359, -359, -359, -359, -359 +.short -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517 +.short 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493 +.short 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422 +.short 287, 287, 287, 287, 287, 287, 287, 287 +.short 202, 202, 202, 202, 202, 202, 202, 202 +.short -171, -171, -171, -171, -171, -171, -171, -171 +.short 622, 622, 622, 622, 622, 622, 622, 622 +.short 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577 +.short 182, 182, 182, 182, 182, 182, 182, 182 +.short 962, 962, 962, 962, 962, 962, 962, 962 +.short -1202, -1202, -1202, -1202, -1202, -1202, -1202, -1202 +.short -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474 +.short 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468 +.short 573, 573, 573, 573, 573, 573, 573, 573 +.short -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325 +.short 264, 264, 264, 264, 264, 
264, 264, 264 +.short 383, 383, 383, 383, 383, 383, 383, 383 +.short -829, -829, -829, -829, -829, -829, -829, -829 +.short 1458, 1458, 1458, 1458, 1458, 1458, 1458, 1458 +.short -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602 +.short -130, -130, -130, -130, -130, -130, -130, -130 +.short -681, -681, -681, -681, -681, -681, -681, -681 +.short 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017 +.short 732, 732, 732, 732, 732, 732, 732, 732 +.short 608, 608, 608, 608, 608, 608, 608, 608 +.short -1542, -1542, -1542, -1542, -1542, -1542, -1542, -1542 +.short 411, 411, 411, 411, 411, 411, 411, 411 +.short -205, -205, -205, -205, -205, -205, -205, -205 +.short -1571, -1571, -1571, -1571, -1571, -1571, -1571, -1571 +.short 1223, 1223, 1223, 1223, 1223, 1223, 1223, 1223 +.short 652, 652, 652, 652, 652, 652, 652, 652 +.short -552, -552, -552, -552, -552, -552, -552, -552 +.short 1015, 1015, 1015, 1015, 1015, 1015, 1015, 1015 +.short -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293 +.short 1491, 1491, 1491, 1491, 1491, 1491, 1491, 1491 +.short -282, -282, -282, -282, -282, -282, -282, -282 +.short -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544 +.short 516, 516, 516, 516, 516, 516, 516, 516 +.short -8, -8, -8, -8, -8, -8, -8, -8 +.short -320, -320, -320, -320, -320, -320, -320, -320 +.short -666, -666, -666, -666, -666, -666, -666, -666 +.short -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618 +.short -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162 +.short 126, 126, 126, 126, 126, 126, 126, 126 +.short 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469 +.short -853, -853, -853, -853, -853, -853, -853, -853 +.short -90, -90, -90, -90, -90, -90, -90, -90 +.short -271, -271, -271, -271, -271, -271, -271, -271 +.short 830, 830, 830, 830, 830, 830, 830, 830 +.short 107, 107, 107, 107, 107, 107, 107, 107 +.short -1421, -1421, -1421, -1421, -1421, -1421, -1421, -1421 +.short -247, -247, -247, -247, -247, -247, -247, -247 +.short -951, -951, -951, -951, -951, 
-951, -951, -951 +.short -398, -398, -398, -398, -398, -398, -398, -398 +.short 961, 961, 961, 961, 961, 961, 961, 961 +.short -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508 +.short -725, -725, -725, -725, -725, -725, -725, -725 +.short 448, 448, 448, 448, 448, 448, 448, 448 +.short -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065 +.short 677, 677, 677, 677, 677, 677, 677, 677 +.short -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275 +.K64: +.short -1103, -1103, -1103, -1103, 430, 430, 430, 430 +.short 555, 555, 555, 555, 843, 843, 843, 843 +.short -1251, -1251, -1251, -1251, 871, 871, 871, 871 +.short 1550, 1550, 1550, 1550, 105, 105, 105, 105 +.short 422, 422, 422, 422, 587, 587, 587, 587 +.short 177, 177, 177, 177, -235, -235, -235, -235 +.short -291, -291, -291, -291, -460, -460, -460, -460 +.short 1574, 1574, 1574, 1574, 1653, 1653, 1653, 1653 +.short -246, -246, -246, -246, 778, 778, 778, 778 +.short 1159, 1159, 1159, 1159, -147, -147, -147, -147 +.short -777, -777, -777, -777, 1483, 1483, 1483, 1483 +.short -602, -602, -602, -602, 1119, 1119, 1119, 1119 +.short -1590, -1590, -1590, -1590, 644, 644, 644, 644 +.short -872, -872, -872, -872, 349, 349, 349, 349 +.short 418, 418, 418, 418, 329, 329, 329, 329 +.short -156, -156, -156, -156, -75, -75, -75, -75 +.short 817, 817, 817, 817, 1097, 1097, 1097, 1097 +.short 603, 603, 603, 603, 610, 610, 610, 610 +.short 1322, 1322, 1322, 1322, -1285, -1285, -1285, -1285 +.short -1465, -1465, -1465, -1465, 384, 384, 384, 384 +.short -1215, -1215, -1215, -1215, -136, -136, -136, -136 +.short 1218, 1218, 1218, 1218, -1335, -1335, -1335, -1335 +.short -874, -874, -874, -874, 220, 220, 220, 220 +.short -1187, -1187, -1187, -1187, -1659, -1659, -1659, -1659 +.short -1185, -1185, -1185, -1185, -1530, -1530, -1530, -1530 +.short -1278, -1278, -1278, -1278, 794, 794, 794, 794 +.short -1510, -1510, -1510, -1510, -854, -854, -854, -854 +.short -870, -870, -870, -870, 478, 478, 478, 478 +.short -108, -108, -108, 
-108, -308, -308, -308, -308 +.short 996, 996, 996, 996, 991, 991, 991, 991 +.short 958, 958, 958, 958, -1460, -1460, -1460, -1460 +.short 1522, 1522, 1522, 1522, 1628, 1628, 1628, 1628 diff --git a/mlkem/src/native/ppc64le/src/poly_tomont.S b/mlkem/src/native/ppc64le/src/poly_tomont.S new file mode 100644 index 000000000..c07f25c5a --- /dev/null +++ b/mlkem/src/native/ppc64le/src/poly_tomont.S @@ -0,0 +1,163 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. +# +#=================================================================================== +# Written by Danny Tsen +# + +# Poly_tomont: In-place conversion of all coefficients of a polynomial +# from normal domain to Montgomery domain +# +# Arguments: *r: pointer to input/output polynomial +# + +#include "../../../common.h" + +#define V1353 0 +#define V_QINV 2 +#define V_NMKQ 5 + +.machine "any" +.text + +# +# montgomery_reduce +# t = a * QINV +# t = (a - (int32_t)t*_MLKEM_Q) >> 16 +# +#----------------------------------- +# MREDUCE_4X(_v0, _v1, _v2, _v3): the four arguments name the result vectors +# +.macro MREDUCE_4X _v0 _v1 _v2 _v3 + lxvd2x 32+13, 0, 3 # load four vectors (64 bytes) at r3, advancing r3 past each + addi 3, 3, 16 + lxvd2x 32+18, 0, 3 + addi 3, 3, 16 + lxvd2x 32+23, 0, 3 + addi 3, 3, 16 + lxvd2x 32+7, 0, 3 + addi 3, 3, 16 + + vmladduhm 15, 13, V1353, 3 # low halves of coefficient times the 1353 constant + vmladduhm 20, 18, V1353, 3 + vmladduhm 25, 23, V1353, 3 + vmladduhm 9, 7, V1353, 3 + + vmhraddshs 14, 13, V1353, 3 # high halves of the same products + vmhraddshs 19, 18, V1353, 3 + vmhraddshs 24, 23, V1353, 3 + vmhraddshs 8, 7, V1353, 3 + + vmladduhm 15, 15, V_QINV, 3 # multiply the low halves by QINV + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, V_QINV, 3 + vmladduhm 9, 9, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 # high half of that times -MLKEM_Q, added to the product high halves + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 9, 9, V_NMKQ, 8 + + vsrah \_v0, 15, 4 # >> 1 + vsrah \_v1, 20, 4 # >> 1 + vsrah \_v2, 25, 4 # >> 1 + vsrah \_v3, 9, 4 # >> 1 +.endm + +.macro Write_8X + stxvd2x 32+27, 4, 3 # r4-r11 hold offsets -128..-16 relative to the advanced r3 + stxvd2x 32+28, 5, 3 + stxvd2x 32+29, 6, 3 + 
stxvd2x 32+30, 7, 3 + stxvd2x 32+13, 8, 3 + stxvd2x 32+18, 9, 3 + stxvd2x 32+23, 10, 3 + stxvd2x 32+7, 11, 3 +.endm + +.align 4 +.globl MLK_ASM_NAMESPACE(poly_tomont_ppc) +MLK_ASM_FN_SYMBOL(poly_tomont_ppc) + stdu 1, -320(1) # allocate a 320-byte stack frame + mflr 0 + + stxv 32+20, 128(1) # save v20-v30, restored before return + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + stxv 32+25, 208(1) + stxv 32+26, 224(1) + stxv 32+27, 240(1) + stxv 32+28, 256(1) + stxv 32+29, 272(1) + stxv 32+30, 288(1) + + addis 9,2,.nmkq@toc@ha + addi 9,9,.nmkq@toc@l + addis 10,2,.C1353@toc@ha + addi 10,10,.C1353@toc@l + + lxv 32+V_NMKQ,0(9) # -MLKEM_Q + lxv 32+V_QINV,16(9) # QINV vector is stored right after .nmkq + lxv 32+V1353,0(10) # the constant 1353 in every halfword + + vxor 3, 3, 3 # v3 = 0 + vspltish 4, 1 # v4 = 1 in each halfword, the vsrah shift count in MREDUCE_4X + + li 4, -128 # store offsets for Write_8X - r3 is 128 bytes past the block after two MREDUCE_4X + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + lxv 32+20, 128(1) # restore v20-v30 + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + lxv 32+25, 208(1) + lxv 32+26, 224(1) + lxv 32+27, 240(1) + lxv 32+28, 256(1) + lxv 32+29, 272(1) + lxv 32+30, 288(1) + mtlr 0 + addi 1, 1, 320 # release the 320-byte frame + blr + +.data +.align 4 +# -MLKEM_Q +.nmkq: +.short -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329 +# QINV +.short -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327 + +.C1353: +.short 1353, 1353, 1353, 1353, 1353, 1353, 1353, 1353 + diff --git a/mlkem/src/native/ppc64le/src/reduce.S b/mlkem/src/native/ppc64le/src/reduce.S new file mode 100644 index 000000000..ee8e1fdca --- /dev/null +++ b/mlkem/src/native/ppc64le/src/reduce.S @@ -0,0 +1,225 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. 
+# +#=================================================================================== +# Written by Danny Tsen +# + +# +# poly_reduce: Applies Barrett reduction to all coefficients of a polynomial +# and then maps every coefficient to its unsigned canonical representative +# +# Arguments: *r: pointer to input/output polynomial +# + +#include "../../../common.h" + +# Barrett reduce constants +#define V20159 0 +#define V_25 1 +#define V_26 2 +#define V_MKQ 3 + +.machine "any" +.text + +.macro BREDUCE_4X _v0 _v1 _v2 _v3 + lxvd2x 32+8, 0, 3 # load four vectors (64 bytes) of coefficients at r3 + lxvd2x 32+12, 14, 3 + lxvd2x 32+16, 15, 3 + lxvd2x 32+20, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, V20159 # 32-bit products of the odd halfwords with the 20159 constant + vmulesh 5, 8, V20159 # 32-bit products of the even halfwords + vmulosh 11, 12, V20159 + vmulesh 10, 12, V20159 + vmulosh 15, 16, V20159 + vmulesh 14, 16, V20159 + vmulosh 19, 20, V20159 + vmulesh 18, 20, V20159 + xxmrglw 32+4, 32+5, 32+6 # interleave the even and odd products + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V_25 # add 2^25 + vadduwm 5, 5, V_25 + vadduwm 9, 9, V_25 + vadduwm 10, 10, V_25 + vadduwm 13, 13, V_25 + vadduwm 14, 14, V_25 + vadduwm 17, 17, V_25 + vadduwm 18, 18, V_25 + vsraw 4, 4, V_26 # arithmetic shift right by 26 gives the quotient t + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 # pack the quotients back to halfwords + vsubuhm 4, 7, 4 # 0 - t (v7 is zeroed by reduce_ppc) + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm \_v0, 4, V_MKQ, 8 # a - t * MLKEM_Q, low 16 bits + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 +.endm + +.macro Write_8X + stxvd2x 32+21, 4, 3 # r4-r11 hold offsets -128..-16 relative to the advanced r3 + stxvd2x 32+22, 5, 3 + stxvd2x 32+23, 6, 3 + stxvd2x 32+24, 7, 3 + stxvd2x 32+4, 8, 3 + stxvd2x 32+9, 9, 3 + stxvd2x 32+13, 10, 3 + stxvd2x 32+17, 11, 3 +.endm + +# +# Conditional addition to get unsigned canonical representative +# +.macro To_unsigned_16 + lxv 32+12, 0(3) + lxv 32+13, 16(3) + lxv 32+14, 
32(3) + lxv 32+15, 48(3) + addi 3, 3, 64 + vsrh 1, 12, 10 # sign bit of each coefficient (v10 holds the shift count 15) + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 # coefficient plus MLKEM_Q (v11) + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 # all-ones mask where the coefficient is non-negative (v9 = 0) + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 32+1, 32+7,32+12, 32+1 # keep the original where the mask is set, else take the sum + xxsel 32+0, 32+8,32+13, 32+0 + xxsel 32+3, 32+5,32+14, 32+3 + xxsel 32+2, 32+6,32+15, 32+2 + stxv 32+3, -32(3) + stxv 32+2, -16(3) + stxv 32+1, -64(3) + stxv 32+0, -48(3) +.endm + +.align 4 +.globl MLK_ASM_NAMESPACE(reduce_ppc) +MLK_ASM_FN_SYMBOL(reduce_ppc) + stdu 1, -224(1) # allocate a 224-byte stack frame + mflr 0 + std 14, 96(1) # save r14-r16, used as load offsets by BREDUCE_4X + std 15, 104(1) + std 16, 112(1) + stxv 32+20, 128(1) # save v20-v24 + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + + addis 8,2,.mkq@toc@ha + addi 8,8,.mkq@toc@l + addis 9,2,.C20159@toc@ha + addi 9,9,.C20159@toc@l + addis 10,2,.C25@toc@ha + addi 10,10,.C25@toc@l + + vxor 7, 7, 7 # v7 = 0, used by BREDUCE_4X to negate the quotient + + lxv 32+V_MKQ, 0(8) + lxv 32+V20159, 0(9) + lxv 32+V_25, 0(10) + + li 4, -128 # store offsets for Write_8X - r3 is 128 bytes past the block after two BREDUCE_4X + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + li 14, 16 # load offsets for BREDUCE_4X + li 15, 32 + li 16, 48 + + vspltisw V_26, 13 # 13 in each word + vadduwm V_26, V_26, V_26 # doubled to 26, the Barrett shift count + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + # + # To unsigned canonical + # +.align 4 + addi 3, 3, -512 # rewind r3 to the start of the 512 bytes just reduced + xxspltib 32+9 ,0 # v9 = 0 + vspltish 10, 15 # shift count 15 isolates the sign bit + vmr 11, V_MKQ # copy MLKEM_Q because To_unsigned_16 overwrites v3 + + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + + ld 14, 96(1) # restore r14-r16 + ld 15, 104(1) + ld 16, 112(1) + lxv 32+20, 128(1) # restore v20-v24 + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + mtlr 0 + addi 1, 1, 224 # release the 224-byte frame + blr + +.data +.align 4 +# MLKEM_Q +.mkq: +.short 3329, 3329, 3329, 3329, 3329, 3329, 3329, 3329 + +.C20159: +.short 20159, 20159, 20159, 
20159, 20159, 20159, 20159, 20159 + +# 0x2000000 +.C25: +.long 33554432, 33554432, 33554432, 33554432 diff --git a/test/mk/auto.mk b/test/mk/auto.mk index bcbf3ac1c..b66eb724b 100644 --- a/test/mk/auto.mk +++ b/test/mk/auto.mk @@ -1,113 +1,33 @@ # SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT # -# Automatically detect system architecture and set preprocessor flags accordingly -# This file detects host CPU capabilities and combines them with compiler support -# to enable optimal compilation flags. +# Automatically detect system architecture and set preprocessor etc accordingly -ifndef _AUTO_MK -_AUTO_MK := - -# Helper function to check if host CPU supports a feature -# Usage: $(call check_host_feature,feature_pattern,source_command) -define check_host_feature -$(shell $(2) 2>/dev/null | grep -q "$(1)" && echo 1 || echo 0) -endef - -# x86_64 architecture detection -ifeq ($(ARCH),x86_64) - -# Host CPU feature detection for x86_64 +# Native compilation +ifeq ($(CROSS_PREFIX),) ifeq ($(HOST_PLATFORM),Linux-x86_64) -# Linux: Use /proc/cpuinfo -MK_HOST_SUPPORTS_AVX2 := $(call check_host_feature,avx2,cat /proc/cpuinfo) -MK_HOST_SUPPORTS_SSE2 := $(call check_host_feature,sse2,cat /proc/cpuinfo) -MK_HOST_SUPPORTS_BMI2 := $(call check_host_feature,bmi2,cat /proc/cpuinfo) -else ifeq ($(HOST_PLATFORM),Darwin-x86_64) -# macOS: Use sysctl -MK_HOST_SUPPORTS_AVX2 := $(call check_host_feature,AVX2,sysctl -n machdep.cpu.leaf7_features) -MK_HOST_SUPPORTS_SSE2 := $(call check_host_feature,SSE2,sysctl -n machdep.cpu.features) -MK_HOST_SUPPORTS_BMI2 := $(call check_host_feature,BMI2,sysctl -n machdep.cpu.leaf7_features) -else ifneq ($(CROSS_PREFIX),) -# Cross-compilation: assume all features are supported -MK_HOST_SUPPORTS_AVX2 := 1 -MK_HOST_SUPPORTS_SSE2 := 1 -MK_HOST_SUPPORTS_BMI2 := 1 -else -# Other platforms: assume no support -MK_HOST_SUPPORTS_AVX2 := 0 -MK_HOST_SUPPORTS_SSE2 := 0 -MK_HOST_SUPPORTS_BMI2 := 0 -endif # HOST_PLATFORM x86_64 - -endif # x86_64 - -# AArch64 
architecture detection -ifeq ($(ARCH),aarch64) - -# Host CPU feature detection for AArch64 -ifeq ($(HOST_PLATFORM),Linux-aarch64) -# Linux: Use /proc/cpuinfo (look for sha3 in Features line) -MK_HOST_SUPPORTS_SHA3 := $(call check_host_feature,sha3,cat /proc/cpuinfo) + CFLAGS += -mavx2 -mbmi2 -mpopcnt -maes + CFLAGS += -DMLK_FORCE_X86_64 +else ifeq ($(HOST_PLATFORM),Linux-aarch64) + CFLAGS += -DMLK_FORCE_AARCH64 else ifeq ($(HOST_PLATFORM),Darwin-arm64) -# macOS: Use sysctl to check for SHA3 support -MK_HOST_SUPPORTS_SHA3 := $(call check_host_feature,1,sysctl -n hw.optional.armv8_2_sha3) -else ifneq ($(CROSS_PREFIX),) -# Cross-compilation: assume all features are supported -MK_HOST_SUPPORTS_SHA3 := 1 -else -# Other platforms: assume no support -MK_HOST_SUPPORTS_SHA3 := 0 -endif # HOST_PLATFORM aarch64 - -endif # aarch64 - -# Only apply CFLAGS modifications if AUTO=1 -ifeq ($(AUTO),1) - -# x86_64 CFLAGS configuration -ifeq ($(ARCH),x86_64) -CFLAGS += -DMLK_FORCE_X86_64 - -# Add flags only if both compiler and host support the feature -ifeq ($(MK_COMPILER_SUPPORTS_AVX2)$(MK_HOST_SUPPORTS_AVX2),11) -CFLAGS += -mavx2 + CFLAGS += -DMLK_FORCE_AARCH64 +else ifeq ($(HOST_PLATFORM),Linux-ppc64le) + CFLAGS += -DMLK_FORCE_PPC64LE endif - -ifeq ($(MK_COMPILER_SUPPORTS_BMI2)$(MK_HOST_SUPPORTS_BMI2),11) -CFLAGS += -mbmi2 +# Cross compilation +else ifneq ($(findstring x86_64, $(CROSS_PREFIX)),) + CFLAGS += -mavx2 -mbmi2 -mpopcnt -maes + CFLAGS += -DMLK_FORCE_X86_64 +else ifneq ($(findstring aarch64_be, $(CROSS_PREFIX)),) + CFLAGS += -DMLK_FORCE_AARCH64_EB +else ifneq ($(findstring aarch64, $(CROSS_PREFIX)),) + CFLAGS += -DMLK_FORCE_AARCH64 +else ifneq ($(findstring riscv64, $(CROSS_PREFIX)),) + CFLAGS += -DMLK_FORCE_RISCV64 +else ifneq ($(findstring riscv32, $(CROSS_PREFIX)),) + CFLAGS += -DMLK_FORCE_RISCV32 +else ifneq ($(findstring powerpc64le, $(CROSS_PREFIX)),) + CFLAGS += -DMLK_FORCE_PPC64LE +else ifneq ($(findstring ppc64le, $(CROSS_PREFIX)),) + CFLAGS += -DMLK_FORCE_PPC64LE 
endif -endif # x86_64 - -# AArch64 CFLAGS configuration -ifeq ($(ARCH),aarch64) -CFLAGS += -DMLK_FORCE_AARCH64 - -# Add SHA3 flags only if both compiler and host support it -ifeq ($(MK_COMPILER_SUPPORTS_SHA3)$(MK_HOST_SUPPORTS_SHA3),11) -CFLAGS += -march=armv8.4-a+sha3 -endif -endif # aarch64 - -# AArch64 Big Endian CFLAGS configuration -ifeq ($(ARCH),aarch64_be) -CFLAGS += -DMLK_FORCE_AARCH64_EB -endif # aarch64_be - -# RISC-V 64-bit CFLAGS configuration -ifeq ($(ARCH),riscv64) -CFLAGS += -DMLK_FORCE_RISCV64 -endif # riscv64 - -# RISC-V 32-bit CFLAGS configuration -ifeq ($(ARCH),riscv32) -CFLAGS += -DMLK_FORCE_RISCV32 -endif # riscv32 - -# PowerPC 64-bit Little Endian CFLAGS configuration -ifeq ($(ARCH),powerpc64le) -CFLAGS += -DMLK_FORCE_PPC64LE -endif # powerpc64le - -endif # AUTO=1 - -endif # _AUTO_MK diff --git a/test/mk/components.mk b/test/mk/components.mk index fabe5b412..77f9f3212 100644 --- a/test/mk/components.mk +++ b/test/mk/components.mk @@ -8,10 +8,11 @@ endif SOURCES += $(wildcard mlkem/src/*.c) ifeq ($(OPT),1) SOURCES += $(wildcard mlkem/src/native/aarch64/src/*.[csS]) $(wildcard mlkem/src/native/x86_64/src/*.[csS]) + SOURCES += $(wildcard mlkem/src/native/ppc64le/src/*.[csS]) CFLAGS += -DMLK_CONFIG_USE_NATIVE_BACKEND_ARITH -DMLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 endif -ALL_TESTS = test_mlkem acvp_mlkem bench_mlkem bench_components_mlkem gen_KAT test_stack +ALL_TESTS = test_mlkem acvp_mlkem bench_mlkem bench_components_mlkem gen_KAT MLKEM512_DIR = $(BUILD_DIR)/mlkem512 MLKEM768_DIR = $(BUILD_DIR)/mlkem768 @@ -24,9 +25,6 @@ $(MLKEM768_OBJS): CFLAGS += -DMLK_CONFIG_PARAMETER_SET=768 MLKEM1024_OBJS = $(call MAKE_OBJS,$(MLKEM1024_DIR),$(SOURCES) $(FIPS202_SRCS)) $(MLKEM1024_OBJS): CFLAGS += -DMLK_CONFIG_PARAMETER_SET=1024 - - - $(BUILD_DIR)/libmlkem512.a: $(MLKEM512_OBJS) $(BUILD_DIR)/libmlkem768.a: $(MLKEM768_OBJS) $(BUILD_DIR)/libmlkem1024.a: $(MLKEM1024_OBJS) @@ -40,10 +38,6 @@ $(MLKEM512_DIR)/bin/bench_components_mlkem512: CFLAGS += -Itest/hal 
$(MLKEM768_DIR)/bin/bench_components_mlkem768: CFLAGS += -Itest/hal $(MLKEM1024_DIR)/bin/bench_components_mlkem1024: CFLAGS += -Itest/hal -$(MLKEM512_DIR)/bin/test_stack512: CFLAGS += -Imlkem/src -fstack-usage -$(MLKEM768_DIR)/bin/test_stack768: CFLAGS += -Imlkem/src -fstack-usage -$(MLKEM1024_DIR)/bin/test_stack1024: CFLAGS += -Imlkem/src -fstack-usage - $(MLKEM512_DIR)/bin/bench_mlkem512: $(MLKEM512_DIR)/test/hal/hal.c.o $(MLKEM768_DIR)/bin/bench_mlkem768: $(MLKEM768_DIR)/test/hal/hal.c.o $(MLKEM1024_DIR)/bin/bench_mlkem1024: $(MLKEM1024_DIR)/test/hal/hal.c.o From 68ee31ca5b07faf45967db681b516b0ff53243bf Mon Sep 17 00:00:00 2001 From: "Matthias J. Kannwischer" Date: Mon, 8 Sep 2025 09:41:46 +0800 Subject: [PATCH 02/16] Document nix 2.18 requirement; disable nix setup test for nix 2.6 Since August 2025, nixpkgs requires a nix version of at least nix 2.18. Consequently, our nix setup tests using nix 2.6.1 and Ubuntu 22 (nix 2.6.0) break. This commit documents that at least nix 2.18 is required, updates the nix test to 2.18.0, and (temporarily) disables the Ubuntu 22 test. Signed-off-by: Matthias J.
Kannwischer Signed-off-by: Danny Tsen --- .github/workflows/nix.yml | 11 +++++++---- CONTRIBUTING.md | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/.github/workflows/nix.yml b/.github/workflows/nix.yml index 3de61b9ea..baba54013 100644 --- a/.github/workflows/nix.yml +++ b/.github/workflows/nix.yml @@ -86,12 +86,15 @@ jobs: fail-fast: false matrix: target: + # nixpkgs requires 2.18 since August 2025, see + # https://github.com/NixOS/nixpkgs/pull/428076 + # TODO: Re-enable tests on Ubuntu 22 once nix has been updated to >= 2.18 + # - runner: ubuntu-22.04 + # container: + # install: 'apt' - runner: ubuntu-latest - container: nixos/nix:2.6.1 + container: nixos/nix:2.18.0 install: 'native' - - runner: ubuntu-22.04 - container: - install: 'apt' - runner: ubuntu-24.04 container: install: 'apt' diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index faed98743..153014738 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -9,7 +9,7 @@ any of the open issues. Here are some things to get you started. We specify the development environment for mlkem-native using `nix`. If you want to help develop mlkem-native, please use `nix`. We recommend using the latest Nix version provided by the [nix installer -script](https://nixos.org/download/), but we currently support all Nix versions >= 2.6. +script](https://nixos.org/download/), but we currently support all Nix versions >= 2.18. All the development and build dependencies are specified in [flake.nix](flake.nix). To execute a bash shell, run ```bash From 1f41a2eb54396207c74310d2af02005d898d62ce Mon Sep 17 00:00:00 2001 From: "Matthias J. Kannwischer" Date: Sat, 2 Aug 2025 20:47:50 +0800 Subject: [PATCH 03/16] Add clang_21 to compiler and constant-time tests Signed-off-by: Matthias J. 
Kannwischer Signed-off-by: Danny Tsen --- .github/workflows/ci.yml | 7 +++++++ .github/workflows/ct-tests.yml | 3 ++- flake.lock | 6 +++--- flake.nix | 3 +++ 4 files changed, 15 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 12aeff977..2e1ee84e3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -257,6 +257,13 @@ jobs: c23: True opt: all examples: true + - name: clang-21 + shell: ci_clang21 + darwin: True + c17: True + c23: True + opt: all + examples: true # CPU flags are not correctly passed to the zig assembler # https://github.com/ziglang/zig/issues/23576 # We therefore only test the C backend diff --git a/.github/workflows/ct-tests.yml b/.github/workflows/ct-tests.yml index 24bde4b2c..0789e5bdb 100644 --- a/.github/workflows/ct-tests.yml +++ b/.github/workflows/ct-tests.yml @@ -26,6 +26,7 @@ jobs: - ci_valgrind-varlat_clang18 - ci_valgrind-varlat_clang19 - ci_valgrind-varlat_clang20 + - ci_valgrind-varlat_clang21 - ci_valgrind-varlat_gcc48 - ci_valgrind-varlat_gcc49 - ci_valgrind-varlat_gcc7 @@ -62,7 +63,7 @@ jobs: valgrind_flags: --variable-latency-errors=yes - name: Build and run test (-Ofast) # -Ofast got deprecated in clang19; -O3 -ffast-math should be used instead - if: ${{ matrix.nix-shell != 'ci_valgrind-varlat_clang19' && matrix.nix-shell != 'ci_valgrind-varlat_clang20' }} + if: ${{ matrix.nix-shell != 'ci_valgrind-varlat_clang19' && matrix.nix-shell != 'ci_valgrind-varlat_clang20' && matrix.nix-shell != 'ci_valgrind-varlat_clang21'}} uses: ./.github/actions/ct-test with: cflags: -Ofast -DMLK_CONFIG_KEYGEN_PCT diff --git a/flake.lock b/flake.lock index b9f3f45e2..a4ac8a8d1 100644 --- a/flake.lock +++ b/flake.lock @@ -54,11 +54,11 @@ }, "nixpkgs-unstable": { "locked": { - "lastModified": 1753939845, - "narHash": "sha256-K2ViRJfdVGE8tpJejs8Qpvvejks1+A4GQej/lBk5y7I=", + "lastModified": 1757068644, + "narHash": "sha256-NOrUtIhTkIIumj1E/Rsv1J37Yi3xGStISEo8tZm3KW4=", "owner": "NixOS", 
"repo": "nixpkgs", - "rev": "94def634a20494ee057c76998843c015909d6311", + "rev": "8eb28adfa3dc4de28e792e3bf49fcf9007ca8ac9", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index bcb9252dd..625fe2f9e 100644 --- a/flake.nix +++ b/flake.nix @@ -50,6 +50,7 @@ gcc48 = pkgs-2405.gcc48; gcc49 = pkgs-2405.gcc49; gcc7 = pkgs-2405.gcc7; + clang_21 = pkgs-unstable.clang_21; }) ]; }; @@ -128,6 +129,7 @@ devShells.ci_clang18 = util.mkShellWithCC' pkgs.clang_18; devShells.ci_clang19 = util.mkShellWithCC' pkgs.clang_19; devShells.ci_clang20 = util.mkShellWithCC' pkgs.clang_20; + devShells.ci_clang21 = util.mkShellWithCC' pkgs.clang_21; devShells.ci_zig0_12 = util.mkShellWithCC' (zigWrapCC pkgs.zig_0_12); devShells.ci_zig0_13 = util.mkShellWithCC' (zigWrapCC pkgs.zig_0_13); @@ -150,6 +152,7 @@ devShells.ci_valgrind-varlat_clang18 = util.mkShellWithCC_valgrind' pkgs.clang_18; devShells.ci_valgrind-varlat_clang19 = util.mkShellWithCC_valgrind' pkgs.clang_19; devShells.ci_valgrind-varlat_clang20 = util.mkShellWithCC_valgrind' pkgs.clang_20; + devShells.ci_valgrind-varlat_clang21 = util.mkShellWithCC_valgrind' pkgs.clang_21; devShells.ci_valgrind-varlat_gcc48 = util.mkShellWithCC_valgrind' pkgs.gcc48; devShells.ci_valgrind-varlat_gcc49 = util.mkShellWithCC_valgrind' pkgs.gcc49; devShells.ci_valgrind-varlat_gcc7 = util.mkShellWithCC_valgrind' pkgs.gcc7; From d4e8c286e654082d3b5e2c0a2bea1e288bf29b59 Mon Sep 17 00:00:00 2001 From: willieyz Date: Tue, 12 Aug 2025 18:22:07 +0800 Subject: [PATCH 04/16] bitwuzla: update the bitwuzla version from 0.7.0 to 0.8.2 in nix Signed-off-by: willieyz Signed-off-by: Danny Tsen --- flake.nix | 4 ++-- nix/cbmc/default.nix | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/flake.nix b/flake.nix index 625fe2f9e..8166f5108 100644 --- a/flake.nix +++ b/flake.nix @@ -25,7 +25,7 @@ util = pkgs.callPackage ./nix/util.nix { # Keep those around in case we want to switch to unstable versions cbmc = pkgs-unstable.cbmc; - 
bitwuzla = pkgs.bitwuzla; + bitwuzla = pkgs-unstable.bitwuzla; z3 = pkgs.z3; }; zigWrapCC = zig: pkgs.symlinkJoin { @@ -170,7 +170,7 @@ util = pkgs.callPackage ./nix/util.nix { inherit pkgs; cbmc = pkgs-unstable.cbmc; - bitwuzla = pkgs.bitwuzla; + bitwuzla = pkgs-unstable.bitwuzla; z3 = pkgs.z3; }; in diff --git a/nix/cbmc/default.nix b/nix/cbmc/default.nix index d9a602284..4fd886b6a 100644 --- a/nix/cbmc/default.nix +++ b/nix/cbmc/default.nix @@ -62,7 +62,7 @@ buildEnv { inherit cadical#2.1.3 - bitwuzla# 0.7.0 + bitwuzla# 0.8.2 ninja; # 1.12.1 }; } From 4e33b285c086e1a9a8346092cd2f9f97f65f8173 Mon Sep 17 00:00:00 2001 From: "Matthias J. Kannwischer" Date: Tue, 9 Sep 2025 18:35:34 +0800 Subject: [PATCH 05/16] Add compiler test for zig 0.15 https://github.com/ziglang/zig/releases/tag/0.15.1 Signed-off-by: Matthias J. Kannwischer Signed-off-by: Danny Tsen --- .github/workflows/ci.yml | 7 +++++++ flake.nix | 2 ++ 2 files changed, 9 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2e1ee84e3..9f6f6fd39 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -291,6 +291,13 @@ jobs: c23: True examples: False opt: no_opt + - name: zig-0.15 + shell: ci_zig0_15 + darwin: True + c17: True + c23: True + examples: False + opt: no_opt runs-on: ${{ matrix.target.runner }} steps: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 diff --git a/flake.nix b/flake.nix index 8166f5108..9e63a64f5 100644 --- a/flake.nix +++ b/flake.nix @@ -51,6 +51,7 @@ gcc49 = pkgs-2405.gcc49; gcc7 = pkgs-2405.gcc7; clang_21 = pkgs-unstable.clang_21; + zig_0_15 = pkgs-unstable.zig_0_15; }) ]; }; @@ -134,6 +135,7 @@ devShells.ci_zig0_12 = util.mkShellWithCC' (zigWrapCC pkgs.zig_0_12); devShells.ci_zig0_13 = util.mkShellWithCC' (zigWrapCC pkgs.zig_0_13); devShells.ci_zig0_14 = util.mkShellWithCC' (zigWrapCC pkgs.zig); + devShells.ci_zig0_15 = util.mkShellWithCC' (zigWrapCC pkgs.zig_0_15); devShells.ci_gcc48 = util.mkShellWithCC' 
pkgs.gcc48; devShells.ci_gcc49 = util.mkShellWithCC' pkgs.gcc49; From aa3b87b67580e44a8e1671a80a8f058cc9c92c6a Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Tue, 9 Sep 2025 13:35:04 -0400 Subject: [PATCH 06/16] Fixed auto.mk, components.mk and YML files. Signed-off-by: Danny Tsen --- integration/liboqs/ML-KEM-1024_META.yml | 154 ++++++++++++++---------- integration/liboqs/ML-KEM-512_META.yml | 154 ++++++++++++++---------- integration/liboqs/ML-KEM-768_META.yml | 154 ++++++++++++++---------- test/mk/auto.mk | 132 ++++++++++++++++---- test/mk/components.mk | 9 +- 5 files changed, 378 insertions(+), 225 deletions(-) diff --git a/integration/liboqs/ML-KEM-1024_META.yml b/integration/liboqs/ML-KEM-1024_META.yml index 766c936e2..c3ffce4e6 100644 --- a/integration/liboqs/ML-KEM-1024_META.yml +++ b/integration/liboqs/ML-KEM-1024_META.yml @@ -9,74 +9,96 @@ length-ciphertext: 1568 length-secret-key: 3168 length-shared-secret: 32 length-keypair-seed: 64 +length-encaps-seed: 32 nistkat-sha256: f580d851e5fb27e6876e5e203fa18be4cdbfd49e05d48fec3d3992c8f43a13e6 testvectors-sha256: ff1a854b9b6761a70c65ccae85246fe0596a949e72eae0866a8a2a2d4ea54b10 principal-submitters: - - Peter Schwabe +- Peter Schwabe auxiliary-submitters: - - Roberto Avanzi - - Joppe Bos - - Léo Ducas - - Eike Kiltz - - Tancrède Lepoint - - Vadim Lyubashevsky - - John M. Schanck - - Gregor Seiler - - Damien Stehlé +- Roberto Avanzi +- Joppe Bos +- Léo Ducas +- Eike Kiltz +- Tancrède Lepoint +- Vadim Lyubashevsky +- John M. Schanck +- Gregor Seiler +- Damien Stehlé implementations: - - name: ref - version: FIPS203 - folder_name: . 
- compile_opts: -DMLK_CONFIG_PARAMETER_SET=1024 -DMLK_CONFIG_FILE="../../integration/liboqs/config_c.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_C_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_C_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_C_enc - signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_C_dec - sources: integration/liboqs/config_c.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc - - name: x86_64 - version: FIPS203 - folder_name: . - compile_opts: -DMLK_CONFIG_PARAMETER_SET=1024 -DMLK_CONFIG_FILE="../../integration/liboqs/config_x86_64.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_enc - signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_dec - sources: integration/liboqs/config_x86_64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/x86_64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc - supported_platforms: - - 
architecture: x86_64 - operating_systems: - - Linux - - Darwin - required_flags: - - avx2 - - bmi2 - - popcnt - - name: aarch64 - version: FIPS203 - folder_name: . - compile_opts: -DMLK_CONFIG_PARAMETER_SET=1024 -DMLK_CONFIG_FILE="../../integration/liboqs/config_aarch64.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_enc - signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_dec - sources: integration/liboqs/config_aarch64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/aarch64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc - supported_platforms: - - architecture: arm_8 - operating_systems: - - Linux - - Darwin - required_flags: - - asimd - - name: ppc64le - version: FIPS203 - folder_name: . 
- compile_opts: -DMLK_CONFIG_PARAMETER_SET=1024 -DMLK_CONFIG_FILE="../../integration/liboqs/config_ppc64le.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_enc - signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_dec - sources: integration/liboqs/config_ppc64le.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/aarch64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc - supported_platforms: - - architecture: ppc64le - operating_systems: - - Linux +- name: ref + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=1024 -DMLK_CONFIG_FILE="../../integration/liboqs/config_c.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_C_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_C_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_C_enc + signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM1024_C_enc_derand + signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_C_dec + sources: integration/liboqs/config_c.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc +- name: x86_64 + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=1024 -DMLK_CONFIG_FILE="../../integration/liboqs/config_x86_64.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_enc + signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_enc_derand + signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_dec + sources: integration/liboqs/config_x86_64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/x86_64 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - avx2 + - bmi2 + - popcnt +- name: aarch64 + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=1024 -DMLK_CONFIG_FILE="../../integration/liboqs/config_aarch64.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_enc + signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_enc_derand + signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_dec + sources: integration/liboqs/config_aarch64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/aarch64 + supported_platforms: + - architecture: arm_8 + operating_systems: + - Linux + - Darwin + required_flags: + - asimd +- name: ppc64le + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=1024 -DMLK_CONFIG_FILE="../../integration/liboqs/config_ppc64le.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_dec + sources: integration/liboqs/config_ppc64le.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/ppc64le mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc + supported_platforms: + - architecture: ppc64le + operating_systems: + - Linux diff --git a/integration/liboqs/ML-KEM-512_META.yml b/integration/liboqs/ML-KEM-512_META.yml index 9d2c7633a..c5fb05e60 100644 --- a/integration/liboqs/ML-KEM-512_META.yml +++ b/integration/liboqs/ML-KEM-512_META.yml @@ -9,74 +9,96 @@ length-ciphertext: 768 length-secret-key: 1632 length-shared-secret: 32 length-keypair-seed: 64 +length-encaps-seed: 32 nistkat-sha256: c70041a761e01cd6426fa60e9fd6a4412c2be817386c8d0f3334898082512782 testvectors-sha256: 6730bb552c22d9d2176ffb5568e48eb30952cf1f065073ec5f9724f6a3c6ea85 principal-submitters: - - Peter Schwabe +- Peter Schwabe auxiliary-submitters: - - Roberto Avanzi - - Joppe Bos - - Léo Ducas - - Eike Kiltz - - Tancrède Lepoint - - Vadim Lyubashevsky - - John M. Schanck - - Gregor Seiler - - Damien Stehlé +- Roberto Avanzi +- Joppe Bos +- Léo Ducas +- Eike Kiltz +- Tancrède Lepoint +- Vadim Lyubashevsky +- John M. Schanck +- Gregor Seiler +- Damien Stehlé implementations: - - name: ref - version: FIPS203 - folder_name: .
- compile_opts: -DMLK_CONFIG_PARAMETER_SET=512 -DMLK_CONFIG_FILE="../../integration/liboqs/config_c.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_C_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_C_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_C_enc - signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_C_dec - sources: integration/liboqs/config_c.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc - - name: x86_64 - version: FIPS203 - folder_name: . - compile_opts: -DMLK_CONFIG_PARAMETER_SET=512 -DMLK_CONFIG_FILE="../../integration/liboqs/config_x86_64.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_enc - signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_dec - sources: integration/liboqs/config_x86_64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/x86_64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc - supported_platforms: - - 
architecture: x86_64 - operating_systems: - - Linux - - Darwin - required_flags: - - avx2 - - bmi2 - - popcnt - - name: aarch64 - version: FIPS203 - folder_name: . - compile_opts: -DMLK_CONFIG_PARAMETER_SET=512 -DMLK_CONFIG_FILE="../../integration/liboqs/config_aarch64.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_enc - signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_dec - sources: integration/liboqs/config_aarch64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/aarch64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc - supported_platforms: - - architecture: arm_8 - operating_systems: - - Linux - - Darwin - required_flags: - - asimd - - name: ppc64le - version: FIPS203 - folder_name: . 
- compile_opts: -DMLK_CONFIG_PARAMETER_SET=512 -DMLK_CONFIG_FILE="../../integration/liboqs/config_ppc64le.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_enc - signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_dec - sources: integration/liboqs/config_ppc64le.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/aarch64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc - supported_platforms: - - architecture: ppc64le - operating_systems: - - Linux +- name: ref + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=512 -DMLK_CONFIG_FILE="../../integration/liboqs/config_c.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_C_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_C_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_C_enc + signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM512_C_enc_derand + signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_C_dec + sources: integration/liboqs/config_c.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc +- name: x86_64 + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=512 -DMLK_CONFIG_FILE="../../integration/liboqs/config_x86_64.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_enc + signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_enc_derand + signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_dec + sources: integration/liboqs/config_x86_64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/x86_64 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - avx2 + - bmi2 + - popcnt +- name: aarch64 + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=512 -DMLK_CONFIG_FILE="../../integration/liboqs/config_aarch64.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_enc + signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_enc_derand + signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_dec + sources: integration/liboqs/config_aarch64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/aarch64 + supported_platforms: + - architecture: arm_8 + operating_systems: + - Linux + - Darwin + required_flags: + - asimd +- name: ppc64le + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=512 -DMLK_CONFIG_FILE="../../integration/liboqs/config_ppc64le.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_dec + sources: integration/liboqs/config_ppc64le.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/ppc64le mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc + supported_platforms: + - architecture: ppc64le + operating_systems: + - Linux diff --git a/integration/liboqs/ML-KEM-768_META.yml b/integration/liboqs/ML-KEM-768_META.yml index e230f3ba6..80b05ba45 100644 --- a/integration/liboqs/ML-KEM-768_META.yml +++ b/integration/liboqs/ML-KEM-768_META.yml @@ -9,74 +9,96 @@ length-ciphertext: 1088 length-secret-key: 2400 length-shared-secret: 32 length-keypair-seed: 64 +length-encaps-seed: 32 nistkat-sha256: 5352539586b6c3df58be6158a6250aeff402bd73060b0a3de68850ac074c17c3 testvectors-sha256: 667c8ca2ca93729c0df6ff24588460bad1bbdbfb64ece0fe8563852a7ff348c6 principal-submitters: - - Peter Schwabe +- Peter Schwabe auxiliary-submitters: - - Roberto Avanzi - - Joppe Bos - - Léo Ducas - - Eike Kiltz - - Tancrède Lepoint - - Vadim Lyubashevsky - - John M. Schanck - - Gregor Seiler - - Damien Stehlé +- Roberto Avanzi +- Joppe Bos +- Léo Ducas +- Eike Kiltz +- Tancrède Lepoint +- Vadim Lyubashevsky +- John M. Schanck +- Gregor Seiler +- Damien Stehlé implementations: - - name: ref - version: FIPS203 - folder_name: .
- compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="../../integration/liboqs/config_c.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_C_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_C_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_C_enc - signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_C_dec - sources: integration/liboqs/config_c.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc - - name: x86_64 - version: FIPS203 - folder_name: . - compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="../../integration/liboqs/config_x86_64.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_enc - signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_dec - sources: integration/liboqs/config_x86_64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/x86_64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc - supported_platforms: - - 
architecture: x86_64 - operating_systems: - - Linux - - Darwin - required_flags: - - avx2 - - bmi2 - - popcnt - - name: aarch64 - version: FIPS203 - folder_name: . - compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="../../integration/liboqs/config_aarch64.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_enc - signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_dec - sources: integration/liboqs/config_aarch64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/aarch64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc - supported_platforms: - - architecture: arm_8 - operating_systems: - - Linux - - Darwin - required_flags: - - asimd - - name: ppc64le - version: FIPS203 - folder_name: . 
- compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="....//integration/liboqs/config_ppc64le.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_enc - signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_dec - sources: integration/liboqs/config_ppc64le.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/aarch64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc - supported_platforms: - - architecture: ppc64le - operating_systems: - - Linux +- name: ref + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="../../integration/liboqs/config_c.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_C_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_C_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_C_enc + signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM768_C_enc_derand + signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_C_dec + sources: integration/liboqs/config_c.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc +- name: x86_64 + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="../../integration/liboqs/config_x86_64.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_enc + signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_enc_derand + signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_dec + sources: integration/liboqs/config_x86_64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/x86_64 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - avx2 + - bmi2 + - popcnt +- name: aarch64 + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="../../integration/liboqs/config_aarch64.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_enc + signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_enc_derand + signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_dec + sources: integration/liboqs/config_aarch64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/aarch64 + supported_platforms: + - architecture: arm_8 + operating_systems: + - Linux + - Darwin + required_flags: + - asimd +- name: ppc64le + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="../../integration/liboqs/config_ppc64le.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_dec + sources: integration/liboqs/config_ppc64le.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/ppc64le mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc + supported_platforms: + - architecture: ppc64le + operating_systems: + - Linux diff --git a/test/mk/auto.mk b/test/mk/auto.mk index b66eb724b..bcbf3ac1c 100644 --- a/test/mk/auto.mk +++ b/test/mk/auto.mk @@ -1,33 +1,113 @@ # SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT # -# Automatically detect system architecture and set preprocessor etc accordingly +# Automatically detect system architecture and set preprocessor flags accordingly +# This file detects host CPU capabilities and combines them with compiler support +# to enable optimal compilation flags.
-# Native compilation -ifeq ($(CROSS_PREFIX),) +ifndef _AUTO_MK +_AUTO_MK := + +# Helper function to check if host CPU supports a feature +# Usage: $(call check_host_feature,feature_pattern,source_command) +define check_host_feature +$(shell $(2) 2>/dev/null | grep -q "$(1)" && echo 1 || echo 0) +endef + +# x86_64 architecture detection +ifeq ($(ARCH),x86_64) + +# Host CPU feature detection for x86_64 ifeq ($(HOST_PLATFORM),Linux-x86_64) - CFLAGS += -mavx2 -mbmi2 -mpopcnt -maes - CFLAGS += -DMLK_FORCE_X86_64 -else ifeq ($(HOST_PLATFORM),Linux-aarch64) - CFLAGS += -DMLK_FORCE_AARCH64 +# Linux: Use /proc/cpuinfo +MK_HOST_SUPPORTS_AVX2 := $(call check_host_feature,avx2,cat /proc/cpuinfo) +MK_HOST_SUPPORTS_SSE2 := $(call check_host_feature,sse2,cat /proc/cpuinfo) +MK_HOST_SUPPORTS_BMI2 := $(call check_host_feature,bmi2,cat /proc/cpuinfo) +else ifeq ($(HOST_PLATFORM),Darwin-x86_64) +# macOS: Use sysctl +MK_HOST_SUPPORTS_AVX2 := $(call check_host_feature,AVX2,sysctl -n machdep.cpu.leaf7_features) +MK_HOST_SUPPORTS_SSE2 := $(call check_host_feature,SSE2,sysctl -n machdep.cpu.features) +MK_HOST_SUPPORTS_BMI2 := $(call check_host_feature,BMI2,sysctl -n machdep.cpu.leaf7_features) +else ifneq ($(CROSS_PREFIX),) +# Cross-compilation: assume all features are supported +MK_HOST_SUPPORTS_AVX2 := 1 +MK_HOST_SUPPORTS_SSE2 := 1 +MK_HOST_SUPPORTS_BMI2 := 1 +else +# Other platforms: assume no support +MK_HOST_SUPPORTS_AVX2 := 0 +MK_HOST_SUPPORTS_SSE2 := 0 +MK_HOST_SUPPORTS_BMI2 := 0 +endif # HOST_PLATFORM x86_64 + +endif # x86_64 + +# AArch64 architecture detection +ifeq ($(ARCH),aarch64) + +# Host CPU feature detection for AArch64 +ifeq ($(HOST_PLATFORM),Linux-aarch64) +# Linux: Use /proc/cpuinfo (look for sha3 in Features line) +MK_HOST_SUPPORTS_SHA3 := $(call check_host_feature,sha3,cat /proc/cpuinfo) else ifeq ($(HOST_PLATFORM),Darwin-arm64) - CFLAGS += -DMLK_FORCE_AARCH64 -else ifeq ($(HOST_PLATFORM),Linux-ppc64le) - CFLAGS += -DMLK_FORCE_PPC64LE +# macOS: Use sysctl to check 
for SHA3 support +MK_HOST_SUPPORTS_SHA3 := $(call check_host_feature,1,sysctl -n hw.optional.armv8_2_sha3) +else ifneq ($(CROSS_PREFIX),) +# Cross-compilation: assume all features are supported +MK_HOST_SUPPORTS_SHA3 := 1 +else +# Other platforms: assume no support +MK_HOST_SUPPORTS_SHA3 := 0 +endif # HOST_PLATFORM aarch64 + +endif # aarch64 + +# Only apply CFLAGS modifications if AUTO=1 +ifeq ($(AUTO),1) + +# x86_64 CFLAGS configuration +ifeq ($(ARCH),x86_64) +CFLAGS += -DMLK_FORCE_X86_64 + +# Add flags only if both compiler and host support the feature +ifeq ($(MK_COMPILER_SUPPORTS_AVX2)$(MK_HOST_SUPPORTS_AVX2),11) +CFLAGS += -mavx2 endif -# Cross compilation -else ifneq ($(findstring x86_64, $(CROSS_PREFIX)),) - CFLAGS += -mavx2 -mbmi2 -mpopcnt -maes - CFLAGS += -DMLK_FORCE_X86_64 -else ifneq ($(findstring aarch64_be, $(CROSS_PREFIX)),) - CFLAGS += -DMLK_FORCE_AARCH64_EB -else ifneq ($(findstring aarch64, $(CROSS_PREFIX)),) - CFLAGS += -DMLK_FORCE_AARCH64 -else ifneq ($(findstring riscv64, $(CROSS_PREFIX)),) - CFLAGS += -DMLK_FORCE_RISCV64 -else ifneq ($(findstring riscv32, $(CROSS_PREFIX)),) - CFLAGS += -DMLK_FORCE_RISCV32 -else ifneq ($(findstring powerpc64le, $(CROSS_PREFIX)),) - CFLAGS += -DMLK_FORCE_PPC64LE -else ifneq ($(findstring ppc64le, $(CROSS_PREFIX)),) - CFLAGS += -DMLK_FORCE_PPC64LE + +ifeq ($(MK_COMPILER_SUPPORTS_BMI2)$(MK_HOST_SUPPORTS_BMI2),11) +CFLAGS += -mbmi2 endif +endif # x86_64 + +# AArch64 CFLAGS configuration +ifeq ($(ARCH),aarch64) +CFLAGS += -DMLK_FORCE_AARCH64 + +# Add SHA3 flags only if both compiler and host support it +ifeq ($(MK_COMPILER_SUPPORTS_SHA3)$(MK_HOST_SUPPORTS_SHA3),11) +CFLAGS += -march=armv8.4-a+sha3 +endif +endif # aarch64 + +# AArch64 Big Endian CFLAGS configuration +ifeq ($(ARCH),aarch64_be) +CFLAGS += -DMLK_FORCE_AARCH64_EB +endif # aarch64_be + +# RISC-V 64-bit CFLAGS configuration +ifeq ($(ARCH),riscv64) +CFLAGS += -DMLK_FORCE_RISCV64 +endif # riscv64 + +# RISC-V 32-bit CFLAGS configuration +ifeq 
($(ARCH),riscv32) +CFLAGS += -DMLK_FORCE_RISCV32 +endif # riscv32 + +# PowerPC 64-bit Little Endian CFLAGS configuration +ifeq ($(ARCH),powerpc64le) +CFLAGS += -DMLK_FORCE_PPC64LE +endif # powerpc64le + +endif # AUTO=1 + +endif # _AUTO_MK diff --git a/test/mk/components.mk b/test/mk/components.mk index 77f9f3212..f3b1f959d 100644 --- a/test/mk/components.mk +++ b/test/mk/components.mk @@ -12,7 +12,7 @@ ifeq ($(OPT),1) CFLAGS += -DMLK_CONFIG_USE_NATIVE_BACKEND_ARITH -DMLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 endif -ALL_TESTS = test_mlkem acvp_mlkem bench_mlkem bench_components_mlkem gen_KAT +ALL_TESTS = test_mlkem acvp_mlkem bench_mlkem bench_components_mlkem gen_KAT test_stack MLKEM512_DIR = $(BUILD_DIR)/mlkem512 MLKEM768_DIR = $(BUILD_DIR)/mlkem768 @@ -25,6 +25,9 @@ $(MLKEM768_OBJS): CFLAGS += -DMLK_CONFIG_PARAMETER_SET=768 MLKEM1024_OBJS = $(call MAKE_OBJS,$(MLKEM1024_DIR),$(SOURCES) $(FIPS202_SRCS)) $(MLKEM1024_OBJS): CFLAGS += -DMLK_CONFIG_PARAMETER_SET=1024 + + + $(BUILD_DIR)/libmlkem512.a: $(MLKEM512_OBJS) $(BUILD_DIR)/libmlkem768.a: $(MLKEM768_OBJS) $(BUILD_DIR)/libmlkem1024.a: $(MLKEM1024_OBJS) @@ -38,6 +41,10 @@ $(MLKEM512_DIR)/bin/bench_components_mlkem512: CFLAGS += -Itest/hal $(MLKEM768_DIR)/bin/bench_components_mlkem768: CFLAGS += -Itest/hal $(MLKEM1024_DIR)/bin/bench_components_mlkem1024: CFLAGS += -Itest/hal +$(MLKEM512_DIR)/bin/test_stack512: CFLAGS += -Imlkem/src -fstack-usage +$(MLKEM768_DIR)/bin/test_stack768: CFLAGS += -Imlkem/src -fstack-usage +$(MLKEM1024_DIR)/bin/test_stack1024: CFLAGS += -Imlkem/src -fstack-usage + $(MLKEM512_DIR)/bin/bench_mlkem512: $(MLKEM512_DIR)/test/hal/hal.c.o $(MLKEM768_DIR)/bin/bench_mlkem768: $(MLKEM768_DIR)/test/hal/hal.c.o $(MLKEM1024_DIR)/bin/bench_mlkem1024: $(MLKEM1024_DIR)/test/hal/hal.c.o From 5f71ef645f13d401afbd4ecb63320020742fa590 Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Thu, 18 Sep 2025 10:52:25 -0500 Subject: [PATCH 07/16] Fixed format and styling by using autogen but no simpasm was run. 
Did some more comments on value bounds in ASM files. Used constants array instead of embedded data in assembly files. Autogen was run under nix env on Mac. Tests were run under HW p10. [05:52] danny@ltcden12-lp4 mlkem_dev % ./scripts/tests func INFO > Functional Test Compile (native no_opt): make func OPT=0 AUTO=1 -j8 INFO > Functional Test ML-KEM-512 (native no_opt): make run_func_512 -j8 INFO > Functional Test ML-KEM-768 (native no_opt): make run_func_768 -j8 INFO > Functional Test ML-KEM-1024 (native no_opt): make run_func_1024 -j8 INFO > Functional Test Compile (native opt): make func OPT=1 AUTO=1 -j8 INFO > Functional Test ML-KEM-512 (native opt): make run_func_512 -j8 INFO > Functional Test ML-KEM-768 (native opt): make run_func_768 -j8 INFO > Functional Test ML-KEM-1024 (native opt): make run_func_1024 -j8 All good! Signed-off-by: Danny Tsen --- BIBLIOGRAPHY.md | 1 + dev/ppc64le/meta.h | 34 +-- dev/ppc64le/src/arith_native_ppc64le.h | 15 +- dev/ppc64le/src/consts.c | 155 +++++++++++ dev/ppc64le/src/consts.h | 26 ++ dev/ppc64le/src/intt_ppc.S | 239 +++++------------ dev/ppc64le/src/ntt_ppc.S | 188 ++++--------- dev/ppc64le/src/poly_tomont.S | 36 ++- dev/ppc64le/src/reduce.S | 48 ++-- integration/liboqs/ML-KEM-1024_META.yml | 8 +- integration/liboqs/ML-KEM-512_META.yml | 8 +- integration/liboqs/ML-KEM-768_META.yml | 8 +- mlkem/mlkem_native.S | 27 ++ mlkem/mlkem_native.c | 27 ++ mlkem/src/native/meta.h | 2 +- mlkem/src/native/ppc64le/meta.h | 30 ++- .../native/ppc64le/src/arith_native_ppc64le.h | 11 +- mlkem/src/native/ppc64le/src/consts.c | 155 +++++++++++ mlkem/src/native/ppc64le/src/consts.h | 26 ++ mlkem/src/native/ppc64le/src/intt_ppc.S | 252 ++++++------------ mlkem/src/native/ppc64le/src/ntt_ppc.S | 192 ++++--------- mlkem/src/native/ppc64le/src/poly_tomont.S | 38 +-- mlkem/src/native/ppc64le/src/reduce.S | 55 ++-- 23 files changed, 831 insertions(+), 750 deletions(-) create mode 100644 dev/ppc64le/src/consts.c create mode 100644 dev/ppc64le/src/consts.h 
create mode 100644 mlkem/src/native/ppc64le/src/consts.c create mode 100644 mlkem/src/native/ppc64le/src/consts.h diff --git a/BIBLIOGRAPHY.md b/BIBLIOGRAPHY.md index ba4ff9718..e8c0bca7b 100644 --- a/BIBLIOGRAPHY.md +++ b/BIBLIOGRAPHY.md @@ -28,6 +28,7 @@ source code and documentation. * Referenced from: - [integration/liboqs/config_aarch64.h](integration/liboqs/config_aarch64.h) - [integration/liboqs/config_c.h](integration/liboqs/config_c.h) + - [integration/liboqs/config_ppc64le.h](integration/liboqs/config_ppc64le.h) - [integration/liboqs/config_x86_64.h](integration/liboqs/config_x86_64.h) - [mlkem/src/config.h](mlkem/src/config.h) - [mlkem/src/kem.c](mlkem/src/kem.c) diff --git a/dev/ppc64le/meta.h b/dev/ppc64le/meta.h index bee788976..34f8cbec6 100644 --- a/dev/ppc64le/meta.h +++ b/dev/ppc64le/meta.h @@ -3,8 +3,8 @@ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ -#ifndef MLK_NATIVE_PPC64LE_META_H -#define MLK_NATIVE_PPC64LE_META_H +#ifndef MLK_DEV_PPC64LE_META_H +#define MLK_DEV_PPC64LE_META_H /* Identifier for this backend so that source and assembly files * in the build can be appropriately guarded. 
*/ @@ -25,25 +25,29 @@ #include "../api.h" #include "src/arith_native_ppc64le.h" -static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) { - mlk_ntt_ppc(data); - return MLK_NATIVE_FUNC_SUCCESS; +static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) +{ + mlk_ntt_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; } -static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) { - mlk_intt_ppc(data); - return MLK_NATIVE_FUNC_SUCCESS; +static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) +{ + mlk_intt_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; } -static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) { - mlk_reduce_ppc(data); - return MLK_NATIVE_FUNC_SUCCESS; +static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) +{ + mlk_reduce_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; } -static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) { - mlk_poly_tomont_ppc(data); - return MLK_NATIVE_FUNC_SUCCESS; +static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) +{ + mlk_poly_tomont_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; } #endif /* !__ASSEMBLER__ */ -#endif /* MLK_NATIVE_PPC64LE_META_H */ +#endif /* !MLK_DEV_PPC64LE_META_H */ diff --git a/dev/ppc64le/src/arith_native_ppc64le.h b/dev/ppc64le/src/arith_native_ppc64le.h index 57f0b8f8c..1c7534668 100644 --- a/dev/ppc64le/src/arith_native_ppc64le.h +++ b/dev/ppc64le/src/arith_native_ppc64le.h @@ -2,22 +2,23 @@ * Copyright (c) 2024-2025 The mlkem-native project authors * SPDX-License-Identifier: Apache-2.0 */ -#ifndef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H -#define MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#ifndef MLK_DEV_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#define MLK_DEV_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H #include #include "../../../common.h" +#include "consts.h" #define mlk_ntt_ppc MLK_NAMESPACE(ntt_ppc) -void mlk_ntt_ppc(int16_t *); +void mlk_ntt_ppc(int16_t *, const int16_t *); #define 
mlk_intt_ppc MLK_NAMESPACE(intt_ppc) -void mlk_intt_ppc(int16_t *); +void mlk_intt_ppc(int16_t *, const int16_t *); #define mlk_reduce_ppc MLK_NAMESPACE(reduce_ppc) -void mlk_reduce_ppc(int16_t *r); +void mlk_reduce_ppc(int16_t *r, const int16_t *); #define mlk_poly_tomont_ppc MLK_NAMESPACE(poly_tomont_ppc) -void mlk_poly_tomont_ppc(int16_t *); +void mlk_poly_tomont_ppc(int16_t *, const int16_t *); -#endif /* MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H */ +#endif /* !MLK_DEV_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H */ diff --git a/dev/ppc64le/src/consts.c b/dev/ppc64le/src/consts.c new file mode 100644 index 000000000..4c2fbdf61 --- /dev/null +++ b/dev/ppc64le/src/consts.c @@ -0,0 +1,155 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include "../../../common.h" + +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" + +MLK_ALIGN const int16_t mlk_ppc_qdata[1568] = { + /* -Q */ + -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329, + /* QINV */ + -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327, + /* Q */ + 3329, 3329, 3329, 3329, 3329, 3329, 3329, 3329, + /* const 20159 for reduce.S and intt */ + 20159, 20159, 20159, 20159, 20159, 20159, 20159, 20159, + /* const 1441 for intt */ + 1441, 1441, 1441, 1441, 1441, 1441, 1441, 1441, + /* for poly_tomont.S */ + 1353, 1353, 1353, 1353, 1353, 1353, 1353, 1353, + /* zetas */ + /* For ntt Len=128, offset 96 */ + -758, -758, -758, -758, -758, -758, -758, -758, -359, -359, -359, -359, + -359, -359, -359, -359, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, + 1422, 1422, 1422, 1422, 1422, 287, 287, 287, 287, 287, 287, 287, 287, 202, + 202, 202, 202, 202, 202, 202, 202, -171, -171, -171, -171, -171, -171, -171, + -171, 622, 622, 622, 622, 622, 622, 622, 622, 1577, 1577, 1577, 1577, 1577, + 1577, 1577, 1577, 
182, 182, 182, 182, 182, 182, 182, 182, 962, 962, 962, + 962, 962, 962, 962, 962, -1202, -1202, -1202, -1202, -1202, -1202, -1202, + -1202, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, 1468, 1468, + 1468, 1468, 1468, 1468, 1468, 1468, 573, 573, 573, 573, 573, 573, 573, 573, + -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 264, 264, 264, 264, + 264, 264, 264, 264, 383, 383, 383, 383, 383, 383, 383, 383, -829, -829, + -829, -829, -829, -829, -829, -829, 1458, 1458, 1458, 1458, 1458, 1458, + 1458, 1458, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -130, + -130, -130, -130, -130, -130, -130, -130, -681, -681, -681, -681, -681, + -681, -681, -681, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 732, 732, + 732, 732, 732, 732, 732, 732, 608, 608, 608, 608, 608, 608, 608, 608, -1542, + -1542, -1542, -1542, -1542, -1542, -1542, -1542, 411, 411, 411, 411, 411, + 411, 411, 411, -205, -205, -205, -205, -205, -205, -205, -205, -1571, -1571, + -1571, -1571, -1571, -1571, -1571, -1571, 1223, 1223, 1223, 1223, 1223, + 1223, 1223, 1223, 652, 652, 652, 652, 652, 652, 652, 652, -552, -552, -552, + -552, -552, -552, -552, -552, 1015, 1015, 1015, 1015, 1015, 1015, 1015, + 1015, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1491, 1491, + 1491, 1491, 1491, 1491, 1491, 1491, -282, -282, -282, -282, -282, -282, + -282, -282, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, 516, + 516, 516, 516, 516, 516, 516, 516, -8, -8, -8, -8, -8, -8, -8, -8, -320, + -320, -320, -320, -320, -320, -320, -320, -666, -666, -666, -666, -666, + -666, -666, -666, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, + -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, 126, 126, 126, 126, + 126, 126, 126, 126, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, -853, + -853, -853, -853, -853, -853, -853, -853, -90, -90, -90, -90, -90, -90, -90, + -90, -271, -271, -271, -271, -271, -271, -271, -271, 830, 830, 830, 830, + 830, 830, 830, 830, 107, 107, 107, 107, 
107, 107, 107, 107, -1421, -1421, + -1421, -1421, -1421, -1421, -1421, -1421, -247, -247, -247, -247, -247, + -247, -247, -247, -951, -951, -951, -951, -951, -951, -951, -951, -398, + -398, -398, -398, -398, -398, -398, -398, 961, 961, 961, 961, 961, 961, 961, + 961, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -725, -725, + -725, -725, -725, -725, -725, -725, 448, 448, 448, 448, 448, 448, 448, 448, + -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, 677, 677, 677, 677, + 677, 677, 677, 677, -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275, + /* For ntt Len=2, offset ZETA_NTT_OFFSET64 */ + -1103, -1103, -1103, -1103, 430, 430, 430, 430, 555, 555, 555, 555, 843, + 843, 843, 843, -1251, -1251, -1251, -1251, 871, 871, 871, 871, 1550, 1550, + 1550, 1550, 105, 105, 105, 105, 422, 422, 422, 422, 587, 587, 587, 587, 177, + 177, 177, 177, -235, -235, -235, -235, -291, -291, -291, -291, -460, -460, + -460, -460, 1574, 1574, 1574, 1574, 1653, 1653, 1653, 1653, -246, -246, + -246, -246, 778, 778, 778, 778, 1159, 1159, 1159, 1159, -147, -147, -147, + -147, -777, -777, -777, -777, 1483, 1483, 1483, 1483, -602, -602, -602, + -602, 1119, 1119, 1119, 1119, -1590, -1590, -1590, -1590, 644, 644, 644, + 644, -872, -872, -872, -872, 349, 349, 349, 349, 418, 418, 418, 418, 329, + 329, 329, 329, -156, -156, -156, -156, -75, -75, -75, -75, 817, 817, 817, + 817, 1097, 1097, 1097, 1097, 603, 603, 603, 603, 610, 610, 610, 610, 1322, + 1322, 1322, 1322, -1285, -1285, -1285, -1285, -1465, -1465, -1465, -1465, + 384, 384, 384, 384, -1215, -1215, -1215, -1215, -136, -136, -136, -136, + 1218, 1218, 1218, 1218, -1335, -1335, -1335, -1335, -874, -874, -874, -874, + 220, 220, 220, 220, -1187, -1187, -1187, -1187, -1659, -1659, -1659, -1659, + -1185, -1185, -1185, -1185, -1530, -1530, -1530, -1530, -1278, -1278, -1278, + -1278, 794, 794, 794, 794, -1510, -1510, -1510, -1510, -854, -854, -854, + -854, -870, -870, -870, -870, 478, 478, 478, 478, -108, -108, -108, -108, + -308,
-308, -308, -308, 996, 996, 996, 996, 991, 991, 991, 991, 958, 958, + 958, 958, -1460, -1460, -1460, -1460, 1522, 1522, 1522, 1522, 1628, 1628, + 1628, 1628, + /* For intt Len=2, offset IZETA_NTT_OFFSET127 */ + 1628, 1628, 1628, 1628, 1522, 1522, 1522, 1522, -1460, -1460, -1460, -1460, + 958, 958, 958, 958, 991, 991, 991, 991, 996, 996, 996, 996, -308, -308, + -308, -308, -108, -108, -108, -108, 478, 478, 478, 478, -870, -870, -870, + -870, -854, -854, -854, -854, -1510, -1510, -1510, -1510, 794, 794, 794, + 794, -1278, -1278, -1278, -1278, -1530, -1530, -1530, -1530, -1185, -1185, + -1185, -1185, -1659, -1659, -1659, -1659, -1187, -1187, -1187, -1187, 220, + 220, 220, 220, -874, -874, -874, -874, -1335, -1335, -1335, -1335, 1218, + 1218, 1218, 1218, -136, -136, -136, -136, -1215, -1215, -1215, -1215, 384, + 384, 384, 384, -1465, -1465, -1465, -1465, -1285, -1285, -1285, -1285, 1322, + 1322, 1322, 1322, 610, 610, 610, 610, 603, 603, 603, 603, 1097, 1097, 1097, + 1097, 817, 817, 817, 817, -75, -75, -75, -75, -156, -156, -156, -156, 329, + 329, 329, 329, 418, 418, 418, 418, 349, 349, 349, 349, -872, -872, -872, + -872, 644, 644, 644, 644, -1590, -1590, -1590, -1590, 1119, 1119, 1119, + 1119, -602, -602, -602, -602, 1483, 1483, 1483, 1483, -777, -777, -777, + -777, -147, -147, -147, -147, 1159, 1159, 1159, 1159, 778, 778, 778, 778, + -246, -246, -246, -246, 1653, 1653, 1653, 1653, 1574, 1574, 1574, 1574, + -460, -460, -460, -460, -291, -291, -291, -291, -235, -235, -235, -235, 177, + 177, 177, 177, 587, 587, 587, 587, 422, 422, 422, 422, 105, 105, 105, 105, + 1550, 1550, 1550, 1550, 871, 871, 871, 871, -1251, -1251, -1251, -1251, 843, + 843, 843, 843, 555, 555, 555, 555, 430, 430, 430, 430, -1103, -1103, -1103, + -1103, + /* For intt Len=4 and others, offset IZETA_NTT_OFFSET63 */ + -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275, 677, 677, 677, 677, + 677, 677, 677, 677, -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, + 448, 448, 448, 448, 448, 448, 
448, 448, -725, -725, -725, -725, -725, -725, + -725, -725, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, 961, + 961, 961, 961, 961, 961, 961, 961, -398, -398, -398, -398, -398, -398, -398, + -398, -951, -951, -951, -951, -951, -951, -951, -951, -247, -247, -247, + -247, -247, -247, -247, -247, -1421, -1421, -1421, -1421, -1421, -1421, + -1421, -1421, 107, 107, 107, 107, 107, 107, 107, 107, 830, 830, 830, 830, + 830, 830, 830, 830, -271, -271, -271, -271, -271, -271, -271, -271, -90, + -90, -90, -90, -90, -90, -90, -90, -853, -853, -853, -853, -853, -853, -853, + -853, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 126, 126, 126, 126, + 126, 126, 126, 126, -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, + -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -666, -666, -666, + -666, -666, -666, -666, -666, -320, -320, -320, -320, -320, -320, -320, + -320, -8, -8, -8, -8, -8, -8, -8, -8, 516, 516, 516, 516, 516, 516, 516, + 516, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -282, -282, + -282, -282, -282, -282, -282, -282, 1491, 1491, 1491, 1491, 1491, 1491, + 1491, 1491, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1015, + 1015, 1015, 1015, 1015, 1015, 1015, 1015, -552, -552, -552, -552, -552, + -552, -552, -552, 652, 652, 652, 652, 652, 652, 652, 652, 1223, 1223, 1223, + 1223, 1223, 1223, 1223, 1223, -1571, -1571, -1571, -1571, -1571, -1571, + -1571, -1571, -205, -205, -205, -205, -205, -205, -205, -205, 411, 411, 411, + 411, 411, 411, 411, 411, -1542, -1542, -1542, -1542, -1542, -1542, -1542, + -1542, 608, 608, 608, 608, 608, 608, 608, 608, 732, 732, 732, 732, 732, 732, + 732, 732, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, -681, -681, -681, + -681, -681, -681, -681, -681, -130, -130, -130, -130, -130, -130, -130, + -130, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, 1458, 1458, + 1458, 1458, 1458, 1458, 1458, 1458, -829, -829, -829, -829, -829, -829, + -829, -829, 383, 383, 383, 383, 383, 383, 383, 383, 
264, 264, 264, 264, 264, + 264, 264, 264, -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 573, + 573, 573, 573, 573, 573, 573, 573, 1468, 1468, 1468, 1468, 1468, 1468, 1468, + 1468, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1202, -1202, + -1202, -1202, -1202, -1202, -1202, -1202, 962, 962, 962, 962, 962, 962, 962, + 962, 182, 182, 182, 182, 182, 182, 182, 182, 1577, 1577, 1577, 1577, 1577, + 1577, 1577, 1577, 622, 622, 622, 622, 622, 622, 622, 622, -171, -171, -171, + -171, -171, -171, -171, -171, 202, 202, 202, 202, 202, 202, 202, 202, 287, + 287, 287, 287, 287, 287, 287, 287, 1422, 1422, 1422, 1422, 1422, 1422, 1422, + 1422, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, -1517, -1517, -1517, + -1517, -1517, -1517, -1517, -1517, -359, -359, -359, -359, -359, -359, -359, + -359, -758, -758, -758, -758, -758, -758, -758, -758}; + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/consts.h b/dev/ppc64le/src/consts.h new file mode 100644 index 000000000..d424601ac --- /dev/null +++ b/dev/ppc64le/src/consts.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLK_DEV_PPC64LE_SRC_CONSTS_H +#define MLK_DEV_PPC64LE_SRC_CONSTS_H +#include "../../../common.h" + +#define NQ_OFFSET 0 +#define QINV_OFFSET 16 +#define Q_OFFSET 32 +#define C20159_OFFSET 48 +#define C1441_OFFSET 64 +#define C1353_OFFSET 80 +#define ZETA_NTT_OFFSET 96 +#define ZETA_NTT_OFFSET64 1104 +#define IZETA_NTT_OFFSET127 1616 +#define IZETA_NTT_OFFSET63 2128 + +#ifndef __ASSEMBLER__ +#define mlk_ppc_qdata MLK_NAMESPACE(ppc_qdata) +extern const int16_t mlk_ppc_qdata[]; +#endif + +#endif /* !MLK_DEV_PPC64LE_SRC_CONSTS_H */ diff --git a/dev/ppc64le/src/intt_ppc.S b/dev/ppc64le/src/intt_ppc.S index feb78b984..1f4b48e42 100644 --- a/dev/ppc64le/src/intt_ppc.S +++ b/dev/ppc64le/src/intt_ppc.S @@ -11,13 +11,18 @@ # #include 
"../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +#include "consts.h" .machine "any" .text # Barrett reduce constatnts #define V20159 0 -#define V_25 1 +#define V_25 1 #define V_26 2 #define V_MKQ 3 @@ -29,11 +34,11 @@ #define V_Z2 9 #define V_Z3 10 #define V_ZETA 10 -#define V1441 10 +#define V1441 10 .macro Load_4Coeffs start next step mr 9, \start # j - add 10, 4, 9 # J + len*2 + add 10, 7, 9 # J + len*2 addi 16, 9, \next addi 17, 10, \step addi 18, 16, \next @@ -73,6 +78,8 @@ xxlor 32+3, 6, 6 # V_MKQ xxlor 32+1, 7, 7 # V_25 xxlor 32+2, 8, 8 # V_26 + # Multiply Odd/Even signed halfword; + # Results word bound by 2^32 in abs value. vmulosh 6, 8, V20159 vmulesh 5, 8, V20159 vmulosh 11, 12, V20159 @@ -97,6 +104,8 @@ vadduwm 14, 14, V_25 vadduwm 17, 17, V_25 vadduwm 18, 18, V_25 + # Right shift and pack lower halfword, + # results bound to 2^16 in abs value vsraw 4, 4, V_26 vsraw 5, 5, V_26 vsraw 9, 9, V_26 @@ -113,6 +122,8 @@ vsubuhm 13, 7, 13 vpkuwum 17, 18, 17 vsubuhm 17, 7, 17 + # Modulo multiply-Low unsigned halfword; + # results bound to 2^16 * q in abs value. 
vmladduhm \_v0, 4, V_MKQ, 8 vmladduhm \_v1, 9, V_MKQ, 12 vmladduhm \_v2, 13, V_MKQ, 16 @@ -123,11 +134,13 @@ # MREDUCE_4X(len, start, _vz0, _vz1, _vz2, _vz3) # .macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 + # Modular multiplication bound by 2^16 * q in abs value vmladduhm 15, 25, \_vz0, 3 vmladduhm 20, 26, \_vz1, 3 vmladduhm 27, 30, \_vz2, 3 vmladduhm 28, 31, \_vz3, 3 + # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value vmhraddshs 14, 25, \_vz0, 3 vmhraddshs 19, 26, \_vz1, 3 vmhraddshs 24, 30, \_vz2, 3 @@ -265,50 +278,41 @@ MLK_ASM_FN_SYMBOL(intt_ppc) # init vectors and constants # Setup for Montgomery reduce - addis 8,2,.nmkq@toc@ha - addi 8,8,.nmkq@toc@l - lxv 0, 0(8) + lxv 0, 0(4) - lxv 32+V_QINV, 16(8) # QINV + lxv 32+V_QINV, QINV_OFFSET(4) # QINV xxlxor 32+3, 32+3, 32+3 vspltish 4, 1 - xxlor 2, 32+2, 32+2 - xxlor 3, 32+3, 32+3 - xxlor 4, 32+4, 32+4 + xxlor 2, 32+2, 32+2 # QINV + xxlor 3, 32+3, 32+3 # 0 + xxlor 4, 32+4, 32+4 # 1 # Setup for Barrett reduce - addis 8,2,.mkq@toc@ha - addi 8,8,.mkq@toc@l - addis 9,2,.C20159@toc@ha - addi 9,9,.C20159@toc@l - addis 10,2,.C25@toc@ha - addi 10,10,.C25@toc@l - - lxv 6, 0(8) # V_MKQ - lxv 32+0, 0(9) # V20159 - lxv 7, 0(10) # V_25 + lxv 6, Q_OFFSET(4) # V_MKQ + lxv 32+V20159, C20159_OFFSET(4) # V20159 + lxv 7, 0(4) # offset 0 is NQ_OFFSET; vs7 is overwritten with V_25 below #xxspltiw 8, 26 # for power9 and above vspltisw 8, 13 vadduwm 8, 8, 8 - xxlor 8, 32+8, 32+8 + xxlor 8, 32+8, 32+8 # V_26 store at vs8 - # zetas array - #addis 14,2,.izeta63@toc@ha - #addi 14,14,.izeta63@toc@l + vspltisw 9, 1 + vsubuwm 10, 8, 9 # 25 + vslw 9, 9, 10 + xxlor 7, 32+9, 32+9 # V_25 store at vs7 .align 4 -__Len2: +#__Len2: # # 1. 
len = 2, start = 0, 4, 8, 12,...244, 248, 252 # Update zetas vectors, each vector has 2 zetas - addis 14,2,.izeta127@toc@ha - addi 14,14,.izeta127@toc@l - li 4, 4 + addi 14, 4, IZETA_NTT_OFFSET127 + li 7, 4 li 15, 4 mtctr 15 li 5, 0 -__Loop2: +intt_ppc__Loop2: Load_4Coeffs 5, 16, 16 BREDUCE_4X 4, 9, 13, 17 xxlor 10, 32+4, 32+4 @@ -333,19 +337,18 @@ __Loop2: MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 Write_Len2_4C 32+13, 32+18, 32+23, 32+28 addi 5, 5, 64 - bdnz __Loop2 + bdnz intt_ppc__Loop2 .align 4 -__Len4: +#__Len4: # # 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 - addis 14,2,.izeta63@toc@ha - addi 14,14,.izeta63@toc@l + addi 14, 4, IZETA_NTT_OFFSET63 li 5, 0 - li 4, 8 + li 7, 8 li 15, 4 # loops mtctr 15 -__Loop4: +intt_ppc__Loop4: Load_4Coeffs 5, 16, 16 BREDUCE_4X 4, 9, 13, 17 xxlor 10, 32+4, 32+4 @@ -369,13 +372,13 @@ __Loop4: MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 Write_Len4_4C 32+13, 32+18, 32+23, 32+28 addi 5, 5, 64 - bdnz __Loop4 + bdnz intt_ppc__Loop4 .align 4 -__Len8: +#__Len8: # 3. len = 8, start = 0, 16, 32, 48,...208, 224, 240 #addi 14, 14, 512 - li 4, 16 + li 7, 16 li 5, 0 Load_4Coeffs 5, 32, 32 @@ -414,12 +417,12 @@ __Len8: Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 -__Len16: +#__Len16: # # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 #addi 14, 14, 768 li 5, 0 - li 4, 32 + li 7, 32 Load_4Coeffs 5, 64, 64 BREDUCE_4X 4, 9, 13, 17 @@ -458,12 +461,12 @@ __Len16: Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 -__Len32: +#__Len32: # # 5. len = 32, start = 0, 64, 128, 192 #addi 14, 14, 896 li 5, 0 - li 4, 64 + li 7, 64 Load_4Coeffs 5, 16, 16 BREDUCE_4X 4, 9, 13, 17 @@ -505,12 +508,12 @@ __Len32: Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 -__Len64: +#__Len64: # # 6. len = 64, start = 0, 128 #addi 14, 14, 960 li 5, 0 - li 4, 128 + li 7, 128 Load_4Coeffs 5, 16, 16 BREDUCE_4X 4, 9, 13, 17 Write_B4C 32+4, 32+9, 32+13, 32+17 @@ -549,12 +552,12 @@ __Len64: Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 -__Len128: +#__Len128: # 7. 
len = 128, start = 0 # #addi 14, 14, 992 li 5, 0 # start - li 4, 256 # len * 2 + li 7, 256 # len * 2 Load_4Coeffs 5, 16, 16 BREDUCE_4X 4, 9, 13, 17 @@ -596,9 +599,8 @@ __Len128: # # Montgomery reduce loops with constant 1441 # - addis 10,2,.C1441@toc@ha - addi 10,10,.C1441@toc@l - lvx V1441, 0, 10 + addi 14, 4, C1441_OFFSET + lvx V1441, 0, 14 Reload_4coeffs MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 @@ -624,7 +626,6 @@ __Len128: MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 -__intt_out: lxv 32+20, 128(1) lxv 32+21, 144(1) lxv 32+22, 160(1) @@ -651,123 +652,21 @@ __intt_out: addi 1, 1, 352 blr -.data -.align 4 -# -MLKEM_Q -.nmkq: -.short -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329 -# QINV -.short -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327 - -# MLKEM_Q -.mkq: -.short 3329, 3329, 3329, 3329, 3329, 3329, 3329, 3329 - -.C20159: -.short 20159, 20159, 20159, 20159, 20159, 20159, 20159, 20159 - -# 0x2000000 -.C25: -.long 33554432, 33554432, 33554432, 33554432 - -.C1441: -.short 1441, 1441, 1441, 1441, 1441, 1441, 1441, 1441 - -.align 4 -.izeta127: -.short 1628, 1628, 1628, 1628, 1522, 1522, 1522, 1522 -.short -1460, -1460, -1460, -1460, 958, 958, 958, 958 -.short 991, 991, 991, 991, 996, 996, 996, 996 -.short -308, -308, -308, -308, -108, -108, -108, -108 -.short 478, 478, 478, 478, -870, -870, -870, -870 -.short -854, -854, -854, -854, -1510, -1510, -1510, -1510 -.short 794, 794, 794, 794, -1278, -1278, -1278, -1278 -.short -1530, -1530, -1530, -1530, -1185, -1185, -1185, -1185 -.short -1659, -1659, -1659, -1659, -1187, -1187, -1187, -1187 -.short 220, 220, 220, 220, -874, -874, -874, -874 -.short -1335, -1335, -1335, -1335, 1218, 1218, 1218, 1218 -.short -136, -136, -136, -136, -1215, -1215, -1215, -1215 -.short 384, 384, 384, 384, -1465, -1465, -1465, -1465 -.short -1285, -1285, -1285, -1285, 1322, 1322, 1322, 1322 -.short 610, 610, 610, 610, 603, 603, 603, 603 -.short 
1097, 1097, 1097, 1097, 817, 817, 817, 817 -.short -75, -75, -75, -75, -156, -156, -156, -156 -.short 329, 329, 329, 329, 418, 418, 418, 418 -.short 349, 349, 349, 349, -872, -872, -872, -872 -.short 644, 644, 644, 644, -1590, -1590, -1590, -1590 -.short 1119, 1119, 1119, 1119, -602, -602, -602, -602 -.short 1483, 1483, 1483, 1483, -777, -777, -777, -777 -.short -147, -147, -147, -147, 1159, 1159, 1159, 1159 -.short 778, 778, 778, 778, -246, -246, -246, -246 -.short 1653, 1653, 1653, 1653, 1574, 1574, 1574, 1574 -.short -460, -460, -460, -460, -291, -291, -291, -291 -.short -235, -235, -235, -235, 177, 177, 177, 177 -.short 587, 587, 587, 587, 422, 422, 422, 422 -.short 105, 105, 105, 105, 1550, 1550, 1550, 1550 -.short 871, 871, 871, 871, -1251, -1251, -1251, -1251 -.short 843, 843, 843, 843, 555, 555, 555, 555 -.short 430, 430, 430, 430, -1103, -1103, -1103, -1103 -.izeta63: -.short -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275 -.short 677, 677, 677, 677, 677, 677, 677, 677 -.short -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065 -.short 448, 448, 448, 448, 448, 448, 448, 448 -.short -725, -725, -725, -725, -725, -725, -725, -725 -.short -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508 -.short 961, 961, 961, 961, 961, 961, 961, 961 -.short -398, -398, -398, -398, -398, -398, -398, -398 -.short -951, -951, -951, -951, -951, -951, -951, -951 -.short -247, -247, -247, -247, -247, -247, -247, -247 -.short -1421, -1421, -1421, -1421, -1421, -1421, -1421, -1421 -.short 107, 107, 107, 107, 107, 107, 107, 107 -.short 830, 830, 830, 830, 830, 830, 830, 830 -.short -271, -271, -271, -271, -271, -271, -271, -271 -.short -90, -90, -90, -90, -90, -90, -90, -90 -.short -853, -853, -853, -853, -853, -853, -853, -853 -.short 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469 -.short 126, 126, 126, 126, 126, 126, 126, 126 -.short -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162 -.short -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618 -.short -666, 
-666, -666, -666, -666, -666, -666, -666 -.short -320, -320, -320, -320, -320, -320, -320, -320 -.short -8, -8, -8, -8, -8, -8, -8, -8 -.short 516, 516, 516, 516, 516, 516, 516, 516 -.short -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544 -.short -282, -282, -282, -282, -282, -282, -282, -282 -.short 1491, 1491, 1491, 1491, 1491, 1491, 1491, 1491 -.short -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293 -.short 1015, 1015, 1015, 1015, 1015, 1015, 1015, 1015 -.short -552, -552, -552, -552, -552, -552, -552, -552 -.short 652, 652, 652, 652, 652, 652, 652, 652 -.short 1223, 1223, 1223, 1223, 1223, 1223, 1223, 1223 -.short -1571, -1571, -1571, -1571, -1571, -1571, -1571, -1571 -.short -205, -205, -205, -205, -205, -205, -205, -205 -.short 411, 411, 411, 411, 411, 411, 411, 411 -.short -1542, -1542, -1542, -1542, -1542, -1542, -1542, -1542 -.short 608, 608, 608, 608, 608, 608, 608, 608 -.short 732, 732, 732, 732, 732, 732, 732, 732 -.short 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017 -.short -681, -681, -681, -681, -681, -681, -681, -681 -.short -130, -130, -130, -130, -130, -130, -130, -130 -.short -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602 -.short 1458, 1458, 1458, 1458, 1458, 1458, 1458, 1458 -.short -829, -829, -829, -829, -829, -829, -829, -829 -.short 383, 383, 383, 383, 383, 383, 383, 383 -.short 264, 264, 264, 264, 264, 264, 264, 264 -.short -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325 -.short 573, 573, 573, 573, 573, 573, 573, 573 -.short 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468 -.short -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474 -.short -1202, -1202, -1202, -1202, -1202, -1202, -1202, -1202 -.short 962, 962, 962, 962, 962, 962, 962, 962 -.short 182, 182, 182, 182, 182, 182, 182, 182 -.short 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577 -.short 622, 622, 622, 622, 622, 622, 622, 622 -.short -171, -171, -171, -171, -171, -171, -171, -171 -.short 202, 202, 202, 202, 202, 202, 202, 202 -.short 287, 287, 287, 
287, 287, 287, 287, 287 -.short 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422 -.short 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493 -.short -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517 -.short -359, -359, -359, -359, -359, -359, -359, -359 -.short -758, -758, -758, -758, -758, -758, -758, -758 +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ +#undef V_QINV +#undef V_NMKQ +#undef V_Z0 +#undef V_Z1 +#undef V_Z2 +#undef V_Z3 +#undef V_ZETA +#undef V1441 + +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/ntt_ppc.S b/dev/ppc64le/src/ntt_ppc.S index 172fef9cc..5bc1c34b8 100644 --- a/dev/ppc64le/src/ntt_ppc.S +++ b/dev/ppc64le/src/ntt_ppc.S @@ -11,14 +11,19 @@ # #include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ -#define V_QINV 2 -#define V_NMKQ 5 -#define V_Z0 7 -#define V_Z1 8 -#define V_Z2 9 -#define V_Z3 10 -#define V_ZETA 10 +#include "consts.h" + +#define V_QINV 2 +#define V_NMKQ 5 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 +#define V_ZETA 10 .machine "any" .text @@ -33,7 +38,7 @@ # .macro MREDUCE_4X start next step _vz0 _vz1 _vz2 _vz3 mr 9, \start - add 10, 4, 9 # J + len*2 + add 10, 7, 9 # J + len*2 addi 16, 9, \next addi 17, 10, \step addi 18, 16, \next @@ -50,11 +55,13 @@ xxpermdi 32+28, 32+28, 32+28, 2 # fqmul = zeta * coefficient + # Modular multiplication bound by 2^16 * q in abs value vmladduhm 15, 13, \_vz0, 3 vmladduhm 20, 18, \_vz1, 3 vmladduhm 25, 23, \_vz2, 3 vmladduhm 30, 28, \_vz3, 3 + # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value vmhraddshs 14, 13, \_vz0, 3 vmhraddshs 19, 18, \_vz1, 3 vmhraddshs 24, 23, \_vz2, 3 @@ -84,6 +91,9 @@ xxpermdi 
32+22, 32+22, 32+22, 2 xxpermdi 32+27, 32+27, 32+27, 2 + # The result of the Montgomery multiplication is bounded + # by q in absolute value. + # Finish by updating the results with add/sub vsubuhm 16, 12, 13 # r - t vadduhm 15, 13, 12 # r + t vsubuhm 21, 17, 18 # r - t @@ -175,20 +185,18 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) stxv 32+31, 304(1) # get MLKEM_Q - addis 8,2,.nmkq@toc@ha - addi 8,8,.nmkq@toc@l - lvx V_NMKQ,0,8 + lvx V_NMKQ,0,4 # zetas array - addis 14,2,.K1@toc@ha - addi 14,14,.K1@toc@l + addi 14, 4, ZETA_NTT_OFFSET vxor 3, 3, 3 vspltish 4, 1 - lxv 32+V_QINV, 16(8) + + lxv 32+V_QINV, QINV_OFFSET(4) .align 4 -__Len128: +#__Len128: # # Compute coefficients of the NTT based on the following loop. # for (len = 128; len ≥ 2; len = len/2) @@ -196,7 +204,7 @@ __Len128: # 1. len = 128, start = 0 # li 5, 0 # start - li 4, 256 # len * 2 + li 7, 256 # len * 2 lvx V_ZETA, 0, 14 addi 14, 14, 16 @@ -213,12 +221,12 @@ __Len128: Write_One .align 4 -__Len64: +#__Len64: # # 2. len = 64, start = 0, 128 # k += 2 li 5, 0 - li 4, 128 + li 7, 128 lvx V_ZETA, 0, 14 addi 14, 14, 16 MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA @@ -237,12 +245,12 @@ __Len64: Write_One .align 4 -__Len32: +#__Len32: # # 3. len = 32, start = 0, 64, 128, 192 # k += 4 li 5, 0 - li 4, 64 + li 7, 64 lvx V_ZETA, 0, 14 addi 14, 14, 16 MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA @@ -270,12 +278,12 @@ __Len32: Write_One .align 4 -__Len16: +#__Len16: # # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 # k += 8 li 5, 0 - li 4, 32 + li 7, 32 Load_next_4zetas MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 Write_One @@ -292,12 +300,12 @@ __Len16: Write_One .align 4 -__Len8: +#__Len8: # # 5. 
len = 8, start = 0, 16, 32, 48,...208, 224, 240 # k += 16 li 5, 0 - li 4, 16 + li 7, 16 Load_next_4zetas MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 Write_One @@ -323,9 +331,9 @@ __Len8: li 15, 4 # loops mtctr 15 li 5, 0 - li 4, 8 + li 7, 8 .align 4 -__Len4: +ntt_ppc__Len4: Load_next_4zetas MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 Write_Two @@ -336,21 +344,21 @@ __Len4: Write_Two addi 5, 5, 64 - bdnz __Len4 + bdnz ntt_ppc__Len4 # # 7. len = 2, start = 0, 4, 8, 12,...244, 248, 252 # k += 64 # Update zetas vectors, each vector has 2 zetas - addis 14,2,.K64@toc@ha - addi 14,14,.K64@toc@l + + addi 14, 4, ZETA_NTT_OFFSET64 li 15, 4 mtctr 15 li 5, 0 - li 4, 4 + li 7, 4 .align 4 -__Len2: +ntt_ppc__Len2: Load_next_4zetas MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 Write_Three @@ -361,9 +369,8 @@ __Len2: Write_Three addi 5, 5, 64 - bdnz __Len2 + bdnz ntt_ppc__Len2 -__ntt_out: lxv 32+20, 128(1) lxv 32+21, 144(1) lxv 32+22, 160(1) @@ -390,109 +397,12 @@ __ntt_out: addi 1, 1, 352 blr -.data -.align 4 -# -MLKEM_Q -.nmkq: -.short -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329 -# QINV -.short -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327 - -# zetas -.K1: -.short -758, -758, -758, -758, -758, -758, -758, -758 -.short -359, -359, -359, -359, -359, -359, -359, -359 -.short -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517 -.short 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493 -.short 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422 -.short 287, 287, 287, 287, 287, 287, 287, 287 -.short 202, 202, 202, 202, 202, 202, 202, 202 -.short -171, -171, -171, -171, -171, -171, -171, -171 -.short 622, 622, 622, 622, 622, 622, 622, 622 -.short 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577 -.short 182, 182, 182, 182, 182, 182, 182, 182 -.short 962, 962, 962, 962, 962, 962, 962, 962 -.short -1202, -1202, -1202, -1202, -1202, -1202, -1202, -1202 -.short -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474 -.short 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468 -.short 
573, 573, 573, 573, 573, 573, 573, 573 -.short -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325 -.short 264, 264, 264, 264, 264, 264, 264, 264 -.short 383, 383, 383, 383, 383, 383, 383, 383 -.short -829, -829, -829, -829, -829, -829, -829, -829 -.short 1458, 1458, 1458, 1458, 1458, 1458, 1458, 1458 -.short -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602 -.short -130, -130, -130, -130, -130, -130, -130, -130 -.short -681, -681, -681, -681, -681, -681, -681, -681 -.short 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017 -.short 732, 732, 732, 732, 732, 732, 732, 732 -.short 608, 608, 608, 608, 608, 608, 608, 608 -.short -1542, -1542, -1542, -1542, -1542, -1542, -1542, -1542 -.short 411, 411, 411, 411, 411, 411, 411, 411 -.short -205, -205, -205, -205, -205, -205, -205, -205 -.short -1571, -1571, -1571, -1571, -1571, -1571, -1571, -1571 -.short 1223, 1223, 1223, 1223, 1223, 1223, 1223, 1223 -.short 652, 652, 652, 652, 652, 652, 652, 652 -.short -552, -552, -552, -552, -552, -552, -552, -552 -.short 1015, 1015, 1015, 1015, 1015, 1015, 1015, 1015 -.short -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293 -.short 1491, 1491, 1491, 1491, 1491, 1491, 1491, 1491 -.short -282, -282, -282, -282, -282, -282, -282, -282 -.short -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544 -.short 516, 516, 516, 516, 516, 516, 516, 516 -.short -8, -8, -8, -8, -8, -8, -8, -8 -.short -320, -320, -320, -320, -320, -320, -320, -320 -.short -666, -666, -666, -666, -666, -666, -666, -666 -.short -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618 -.short -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162 -.short 126, 126, 126, 126, 126, 126, 126, 126 -.short 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469 -.short -853, -853, -853, -853, -853, -853, -853, -853 -.short -90, -90, -90, -90, -90, -90, -90, -90 -.short -271, -271, -271, -271, -271, -271, -271, -271 -.short 830, 830, 830, 830, 830, 830, 830, 830 -.short 107, 107, 107, 107, 107, 107, 107, 107 -.short -1421, 
-1421, -1421, -1421, -1421, -1421, -1421, -1421 -.short -247, -247, -247, -247, -247, -247, -247, -247 -.short -951, -951, -951, -951, -951, -951, -951, -951 -.short -398, -398, -398, -398, -398, -398, -398, -398 -.short 961, 961, 961, 961, 961, 961, 961, 961 -.short -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508 -.short -725, -725, -725, -725, -725, -725, -725, -725 -.short 448, 448, 448, 448, 448, 448, 448, 448 -.short -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065 -.short 677, 677, 677, 677, 677, 677, 677, 677 -.short -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275 -.K64: -.short -1103, -1103, -1103, -1103, 430, 430, 430, 430 -.short 555, 555, 555, 555, 843, 843, 843, 843 -.short -1251, -1251, -1251, -1251, 871, 871, 871, 871 -.short 1550, 1550, 1550, 1550, 105, 105, 105, 105 -.short 422, 422, 422, 422, 587, 587, 587, 587 -.short 177, 177, 177, 177, -235, -235, -235, -235 -.short -291, -291, -291, -291, -460, -460, -460, -460 -.short 1574, 1574, 1574, 1574, 1653, 1653, 1653, 1653 -.short -246, -246, -246, -246, 778, 778, 778, 778 -.short 1159, 1159, 1159, 1159, -147, -147, -147, -147 -.short -777, -777, -777, -777, 1483, 1483, 1483, 1483 -.short -602, -602, -602, -602, 1119, 1119, 1119, 1119 -.short -1590, -1590, -1590, -1590, 644, 644, 644, 644 -.short -872, -872, -872, -872, 349, 349, 349, 349 -.short 418, 418, 418, 418, 329, 329, 329, 329 -.short -156, -156, -156, -156, -75, -75, -75, -75 -.short 817, 817, 817, 817, 1097, 1097, 1097, 1097 -.short 603, 603, 603, 603, 610, 610, 610, 610 -.short 1322, 1322, 1322, 1322, -1285, -1285, -1285, -1285 -.short -1465, -1465, -1465, -1465, 384, 384, 384, 384 -.short -1215, -1215, -1215, -1215, -136, -136, -136, -136 -.short 1218, 1218, 1218, 1218, -1335, -1335, -1335, -1335 -.short -874, -874, -874, -874, 220, 220, 220, 220 -.short -1187, -1187, -1187, -1187, -1659, -1659, -1659, -1659 -.short -1185, -1185, -1185, -1185, -1530, -1530, -1530, -1530 -.short -1278, -1278, -1278, -1278, 794, 794, 794, 
794 -.short -1510, -1510, -1510, -1510, -854, -854, -854, -854 -.short -870, -870, -870, -870, 478, 478, 478, 478 -.short -108, -108, -108, -108, -308, -308, -308, -308 -.short 996, 996, 996, 996, 991, 991, 991, 991 -.short 958, 958, 958, 958, -1460, -1460, -1460, -1460 -.short 1522, 1522, 1522, 1522, 1628, 1628, 1628, 1628 +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V_QINV +#undef V_NMKQ +#undef V_ZETA + +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/poly_tomont.S b/dev/ppc64le/src/poly_tomont.S index c07f25c5a..b7b010aaf 100644 --- a/dev/ppc64le/src/poly_tomont.S +++ b/dev/ppc64le/src/poly_tomont.S @@ -17,8 +17,13 @@ # #include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ -#define V1353 0 +#include "consts.h" + +#define V1353 0 #define V_QINV 2 #define V_NMKQ 5 @@ -98,14 +103,9 @@ MLK_ASM_FN_SYMBOL(poly_tomont_ppc) stxv 32+29, 272(1) stxv 32+30, 288(1) - addis 9,2,.nmkq@toc@ha - addi 9,9,.nmkq@toc@l - addis 10,2,.C1353@toc@ha - addi 10,10,.C1353@toc@l - - lxv 32+V_NMKQ,0(9) - lxv 32+V_QINV,16(9) - lxv 32+V1353,0(10) + lxv 32+V_NMKQ, NQ_OFFSET(4) + lxv 32+V_QINV, QINV_OFFSET(4) + lxv 32+V1353, C1353_OFFSET(4) vxor 3, 3, 3 vspltish 4, 1 @@ -150,14 +150,12 @@ MLK_ASM_FN_SYMBOL(poly_tomont_ppc) addi 1, 1, 320 blr -.data -.align 4 -# -MLKEM_Q -.nmkq: -.short -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329 -# QINV -.short -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327 - -.C1353: -.short 1353, 1353, 1353, 1353, 1353, 1353, 1353, 1353 +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. 
*/ +#undef V1353 +#undef V_QINV +#undef V_NMKQ +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/reduce.S b/dev/ppc64le/src/reduce.S index ee8e1fdca..dfb634392 100644 --- a/dev/ppc64le/src/reduce.S +++ b/dev/ppc64le/src/reduce.S @@ -18,10 +18,15 @@ # #include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +#include "consts.h" # Barrett reduce constatnts #define V20159 0 -#define V_25 1 +#define V_25 1 #define V_26 2 #define V_MKQ 3 @@ -136,18 +141,16 @@ MLK_ASM_FN_SYMBOL(reduce_ppc) stxv 32+23, 176(1) stxv 32+24, 192(1) - addis 8,2,.mkq@toc@ha - addi 8,8,.mkq@toc@l - addis 9,2,.C20159@toc@ha - addi 9,9,.C20159@toc@l - addis 10,2,.C25@toc@ha - addi 10,10,.C25@toc@l - vxor 7, 7, 7 - lxv 32+V_MKQ, 0(8) - lxv 32+V20159, 0(9) - lxv 32+V_25, 0(10) + lxv 32+V_MKQ, Q_OFFSET(4) + lxv 32+V20159, C20159_OFFSET(4) + + vspltisw V_26, 13 + vadduwm V_26, V_26, V_26 + vspltisw 4, 1 + vsubuwm 5, V_26, 4 + vslw V_25, 4, 5 li 4, -128 li 5, -112 @@ -162,9 +165,6 @@ MLK_ASM_FN_SYMBOL(reduce_ppc) li 15, 32 li 16, 48 - vspltisw V_26, 13 - vadduwm V_26, V_26, V_26 - BREDUCE_4X 21, 22, 23, 24 BREDUCE_4X 4, 9, 13, 17 Write_8X @@ -211,15 +211,13 @@ MLK_ASM_FN_SYMBOL(reduce_ppc) addi 1, 1, 224 blr -.align 4 -.data -# MLKEM_Q -.mkq: -.short 3329, 3329, 3329, 3329, 3329, 3329, 3329, 3329 - -.C20159: -.short 20159, 20159, 20159, 20159, 20159, 20159, 20159, 20159 +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. 
*/ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ -# 0x2000000 -.C25: -.long 33554432, 33554432, 33554432, 33554432 +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/integration/liboqs/ML-KEM-1024_META.yml b/integration/liboqs/ML-KEM-1024_META.yml index c3ffce4e6..9c7fe672a 100644 --- a/integration/liboqs/ML-KEM-1024_META.yml +++ b/integration/liboqs/ML-KEM-1024_META.yml @@ -97,7 +97,13 @@ implementations: signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_keypair_derand signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_enc signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_dec - sources: integration/liboqs/config_ppc64le.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/aarch64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc + sources: integration/liboqs/config_ppc64le.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/ppc64le supported_platforms: - architecture: ppc64le operating_systems: diff --git a/integration/liboqs/ML-KEM-512_META.yml 
b/integration/liboqs/ML-KEM-512_META.yml index c5fb05e60..f46dbfdbf 100644 --- a/integration/liboqs/ML-KEM-512_META.yml +++ b/integration/liboqs/ML-KEM-512_META.yml @@ -97,7 +97,13 @@ implementations: signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_keypair_derand signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_enc signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_dec - sources: integration/liboqs/config_ppc64le.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/aarch64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc + sources: integration/liboqs/config_ppc64le.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/ppc64le supported_platforms: - architecture: ppc64le operating_systems: diff --git a/integration/liboqs/ML-KEM-768_META.yml b/integration/liboqs/ML-KEM-768_META.yml index 80b05ba45..1b01c4d42 100644 --- a/integration/liboqs/ML-KEM-768_META.yml +++ b/integration/liboqs/ML-KEM-768_META.yml @@ -97,7 +97,13 @@ implementations: signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_keypair_derand signature_enc: 
PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_enc signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_dec - sources: integration/liboqs/config_ppc64le.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/aarch64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc + sources: integration/liboqs/config_ppc64le.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/ppc64le supported_platforms: - architecture: ppc64le operating_systems: diff --git a/mlkem/mlkem_native.S b/mlkem/mlkem_native.S index b74591221..a12940785 100644 --- a/mlkem/mlkem_native.S +++ b/mlkem/mlkem_native.S @@ -457,6 +457,33 @@ #undef MLK_NTT_BOUND /* mlkem/src/native/meta.h */ #undef MLK_NATIVE_META_H +/* mlkem/src/native/ppc64le/meta.h */ +#undef MLK_ARITH_BACKEND_NAME +#undef MLK_ARITH_BACKEND_PPC64LE_DEFAULT +#undef MLK_NATIVE_PPC64LE_META_H +#undef MLK_USE_NATIVE_INTT +#undef MLK_USE_NATIVE_NTT +#undef MLK_USE_NATIVE_POLY_REDUCE +#undef MLK_USE_NATIVE_POLY_TOMONT +/* mlkem/src/native/ppc64le/src/arith_native_ppc64le.h */ +#undef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#undef mlk_intt_ppc 
+#undef mlk_ntt_ppc +#undef mlk_poly_tomont_ppc +#undef mlk_reduce_ppc +/* mlkem/src/native/ppc64le/src/consts.h */ +#undef C1353_OFFSET +#undef C1441_OFFSET +#undef C20159_OFFSET +#undef IZETA_NTT_OFFSET127 +#undef IZETA_NTT_OFFSET63 +#undef MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#undef NQ_OFFSET +#undef QINV_OFFSET +#undef Q_OFFSET +#undef ZETA_NTT_OFFSET +#undef ZETA_NTT_OFFSET64 +#undef mlk_ppc_qdata #if defined(MLK_SYS_AARCH64) /* * Undefine macros from native code (Arith, AArch64) diff --git a/mlkem/mlkem_native.c b/mlkem/mlkem_native.c index 51bc1e33e..18501942e 100644 --- a/mlkem/mlkem_native.c +++ b/mlkem/mlkem_native.c @@ -444,6 +444,33 @@ #undef MLK_NTT_BOUND /* mlkem/src/native/meta.h */ #undef MLK_NATIVE_META_H +/* mlkem/src/native/ppc64le/meta.h */ +#undef MLK_ARITH_BACKEND_NAME +#undef MLK_ARITH_BACKEND_PPC64LE_DEFAULT +#undef MLK_NATIVE_PPC64LE_META_H +#undef MLK_USE_NATIVE_INTT +#undef MLK_USE_NATIVE_NTT +#undef MLK_USE_NATIVE_POLY_REDUCE +#undef MLK_USE_NATIVE_POLY_TOMONT +/* mlkem/src/native/ppc64le/src/arith_native_ppc64le.h */ +#undef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#undef mlk_intt_ppc +#undef mlk_ntt_ppc +#undef mlk_poly_tomont_ppc +#undef mlk_reduce_ppc +/* mlkem/src/native/ppc64le/src/consts.h */ +#undef C1353_OFFSET +#undef C1441_OFFSET +#undef C20159_OFFSET +#undef IZETA_NTT_OFFSET127 +#undef IZETA_NTT_OFFSET63 +#undef MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#undef NQ_OFFSET +#undef QINV_OFFSET +#undef Q_OFFSET +#undef ZETA_NTT_OFFSET +#undef ZETA_NTT_OFFSET64 +#undef mlk_ppc_qdata #if defined(MLK_SYS_AARCH64) /* * Undefine macros from native code (Arith, AArch64) diff --git a/mlkem/src/native/meta.h b/mlkem/src/native/meta.h index 7fdcd6fcf..e39188323 100644 --- a/mlkem/src/native/meta.h +++ b/mlkem/src/native/meta.h @@ -20,6 +20,6 @@ #ifdef MLK_SYS_PPC64LE #include "ppc64le/meta.h" -#endif /* MLK_SYS_PPC64LE */ +#endif #endif /* !MLK_NATIVE_META_H */ diff --git a/mlkem/src/native/ppc64le/meta.h b/mlkem/src/native/ppc64le/meta.h index 
bee788976..54b3ddd9c 100644 --- a/mlkem/src/native/ppc64le/meta.h +++ b/mlkem/src/native/ppc64le/meta.h @@ -25,25 +25,29 @@ #include "../api.h" #include "src/arith_native_ppc64le.h" -static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) { - mlk_ntt_ppc(data); - return MLK_NATIVE_FUNC_SUCCESS; +static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) +{ + mlk_ntt_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; } -static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) { - mlk_intt_ppc(data); - return MLK_NATIVE_FUNC_SUCCESS; +static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) +{ + mlk_intt_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; } -static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) { - mlk_reduce_ppc(data); - return MLK_NATIVE_FUNC_SUCCESS; +static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) +{ + mlk_reduce_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; } -static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) { - mlk_poly_tomont_ppc(data); - return MLK_NATIVE_FUNC_SUCCESS; +static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) +{ + mlk_poly_tomont_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; } #endif /* !__ASSEMBLER__ */ -#endif /* MLK_NATIVE_PPC64LE_META_H */ +#endif /* !MLK_NATIVE_PPC64LE_META_H */ diff --git a/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h b/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h index 57f0b8f8c..dbcee3e3e 100644 --- a/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h +++ b/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h @@ -7,17 +7,18 @@ #include #include "../../../common.h" +#include "consts.h" #define mlk_ntt_ppc MLK_NAMESPACE(ntt_ppc) -void mlk_ntt_ppc(int16_t *); +void mlk_ntt_ppc(int16_t *, const int16_t *); #define mlk_intt_ppc MLK_NAMESPACE(intt_ppc) -void mlk_intt_ppc(int16_t *); +void mlk_intt_ppc(int16_t *, const int16_t *); #define mlk_reduce_ppc 
MLK_NAMESPACE(reduce_ppc) -void mlk_reduce_ppc(int16_t *r); +void mlk_reduce_ppc(int16_t *r, const int16_t *); #define mlk_poly_tomont_ppc MLK_NAMESPACE(poly_tomont_ppc) -void mlk_poly_tomont_ppc(int16_t *); +void mlk_poly_tomont_ppc(int16_t *, const int16_t *); -#endif /* MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H */ +#endif /* !MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H */ diff --git a/mlkem/src/native/ppc64le/src/consts.c b/mlkem/src/native/ppc64le/src/consts.c new file mode 100644 index 000000000..4c2fbdf61 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/consts.c @@ -0,0 +1,155 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include "../../../common.h" + +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" + +MLK_ALIGN const int16_t mlk_ppc_qdata[1568] = { + /* -Q */ + -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329, + /* QINV */ + -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327, + /* Q */ + 3329, 3329, 3329, 3329, 3329, 3329, 3329, 3329, + /* const 20159 for reduce.S and intt */ + 20159, 20159, 20159, 20159, 20159, 20159, 20159, 20159, + /* const 1441 for intt */ + 1441, 1441, 1441, 1441, 1441, 1441, 1441, 1441, + /* for poly_tomont.S */ + 1353, 1353, 1353, 1353, 1353, 1353, 1353, 1353, + /* zetas */ + /* For ntt Len=128, offset 96 */ + -758, -758, -758, -758, -758, -758, -758, -758, -359, -359, -359, -359, + -359, -359, -359, -359, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, + 1422, 1422, 1422, 1422, 1422, 287, 287, 287, 287, 287, 287, 287, 287, 202, + 202, 202, 202, 202, 202, 202, 202, -171, -171, -171, -171, -171, -171, -171, + -171, 622, 622, 622, 622, 622, 622, 622, 622, 1577, 1577, 1577, 1577, 1577, + 1577, 1577, 1577, 182, 182, 182, 182, 182, 182, 182, 182, 962, 962, 962, + 962, 962, 962, 962, 962, -1202, -1202, 
-1202, -1202, -1202, -1202, -1202, + -1202, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, 1468, 1468, + 1468, 1468, 1468, 1468, 1468, 1468, 573, 573, 573, 573, 573, 573, 573, 573, + -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 264, 264, 264, 264, + 264, 264, 264, 264, 383, 383, 383, 383, 383, 383, 383, 383, -829, -829, + -829, -829, -829, -829, -829, -829, 1458, 1458, 1458, 1458, 1458, 1458, + 1458, 1458, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -130, + -130, -130, -130, -130, -130, -130, -130, -681, -681, -681, -681, -681, + -681, -681, -681, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 732, 732, + 732, 732, 732, 732, 732, 732, 608, 608, 608, 608, 608, 608, 608, 608, -1542, + -1542, -1542, -1542, -1542, -1542, -1542, -1542, 411, 411, 411, 411, 411, + 411, 411, 411, -205, -205, -205, -205, -205, -205, -205, -205, -1571, -1571, + -1571, -1571, -1571, -1571, -1571, -1571, 1223, 1223, 1223, 1223, 1223, + 1223, 1223, 1223, 652, 652, 652, 652, 652, 652, 652, 652, -552, -552, -552, + -552, -552, -552, -552, -552, 1015, 1015, 1015, 1015, 1015, 1015, 1015, + 1015, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1491, 1491, + 1491, 1491, 1491, 1491, 1491, 1491, -282, -282, -282, -282, -282, -282, + -282, -282, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, 516, + 516, 516, 516, 516, 516, 516, 516, -8, -8, -8, -8, -8, -8, -8, -8, -320, + -320, -320, -320, -320, -320, -320, -320, -666, -666, -666, -666, -666, + -666, -666, -666, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, + -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, 126, 126, 126, 126, + 126, 126, 126, 126, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, -853, + -853, -853, -853, -853, -853, -853, -853, -90, -90, -90, -90, -90, -90, -90, + -90, -271, -271, -271, -271, -271, -271, -271, -271, 830, 830, 830, 830, + 830, 830, 830, 830, 107, 107, 107, 107, 107, 107, 107, 107, -1421, -1421, + -1421, -1421, -1421, -1421, -1421, -1421, -247, -247, -247, 
-247, -247, + -247, -247, -247, -951, -951, -951, -951, -951, -951, -951, -951, -398, + -398, -398, -398, -398, -398, -398, -398, 961, 961, 961, 961, 961, 961, 961, + 961, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -725, -725, + -725, -725, -725, -725, -725, -725, 448, 448, 448, 448, 448, 448, 448, 448, + -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, 677, 677, 677, 677, + 677, 677, 677, 677, -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275, + /* For ntt Len=2, offset ZETA_NTT_OFFSET64 */ + -1103, -1103, -1103, -1103, 430, 430, 430, 430, 555, 555, 555, 555, 843, + 843, 843, 843, -1251, -1251, -1251, -1251, 871, 871, 871, 871, 1550, 1550, + 1550, 1550, 105, 105, 105, 105, 422, 422, 422, 422, 587, 587, 587, 587, 177, + 177, 177, 177, -235, -235, -235, -235, -291, -291, -291, -291, -460, -460, + -460, -460, 1574, 1574, 1574, 1574, 1653, 1653, 1653, 1653, -246, -246, + -246, -246, 778, 778, 778, 778, 1159, 1159, 1159, 1159, -147, -147, -147, + -147, -777, -777, -777, -777, 1483, 1483, 1483, 1483, -602, -602, -602, + -602, 1119, 1119, 1119, 1119, -1590, -1590, -1590, -1590, 644, 644, 644, + 644, -872, -872, -872, -872, 349, 349, 349, 349, 418, 418, 418, 418, 329, + 329, 329, 329, -156, -156, -156, -156, -75, -75, -75, -75, 817, 817, 817, + 817, 1097, 1097, 1097, 1097, 603, 603, 603, 603, 610, 610, 610, 610, 1322, + 1322, 1322, 1322, -1285, -1285, -1285, -1285, -1465, -1465, -1465, -1465, + 384, 384, 384, 384, -1215, -1215, -1215, -1215, -136, -136, -136, -136, + 1218, 1218, 1218, 1218, -1335, -1335, -1335, -1335, -874, -874, -874, -874, + 220, 220, 220, 220, -1187, -1187, -1187, -1187, -1659, -1659, -1659, -1659, + -1185, -1185, -1185, -1185, -1530, -1530, -1530, -1530, -1278, -1278, -1278, + -1278, 794, 794, 794, 794, -1510, -1510, -1510, -1510, -854, -854, -854, + -854, -870, -870, -870, -870, 478, 478, 478, 478, -108, -108, -108, -108, + -308, -308, -308, -308, 996, 996, 996, 996, 991, 991, 991, 991, 958, 958, + 958, 958, -1460, -1460,
-1460, -1460, 1522, 1522, 1522, 1522, 1628, 1628, + 1628, 1628, + /* For intt Len=2, offset IZETA_NTT_OFFSET127 */ + 1628, 1628, 1628, 1628, 1522, 1522, 1522, 1522, -1460, -1460, -1460, -1460, + 958, 958, 958, 958, 991, 991, 991, 991, 996, 996, 996, 996, -308, -308, + -308, -308, -108, -108, -108, -108, 478, 478, 478, 478, -870, -870, -870, + -870, -854, -854, -854, -854, -1510, -1510, -1510, -1510, 794, 794, 794, + 794, -1278, -1278, -1278, -1278, -1530, -1530, -1530, -1530, -1185, -1185, + -1185, -1185, -1659, -1659, -1659, -1659, -1187, -1187, -1187, -1187, 220, + 220, 220, 220, -874, -874, -874, -874, -1335, -1335, -1335, -1335, 1218, + 1218, 1218, 1218, -136, -136, -136, -136, -1215, -1215, -1215, -1215, 384, + 384, 384, 384, -1465, -1465, -1465, -1465, -1285, -1285, -1285, -1285, 1322, + 1322, 1322, 1322, 610, 610, 610, 610, 603, 603, 603, 603, 1097, 1097, 1097, + 1097, 817, 817, 817, 817, -75, -75, -75, -75, -156, -156, -156, -156, 329, + 329, 329, 329, 418, 418, 418, 418, 349, 349, 349, 349, -872, -872, -872, + -872, 644, 644, 644, 644, -1590, -1590, -1590, -1590, 1119, 1119, 1119, + 1119, -602, -602, -602, -602, 1483, 1483, 1483, 1483, -777, -777, -777, + -777, -147, -147, -147, -147, 1159, 1159, 1159, 1159, 778, 778, 778, 778, + -246, -246, -246, -246, 1653, 1653, 1653, 1653, 1574, 1574, 1574, 1574, + -460, -460, -460, -460, -291, -291, -291, -291, -235, -235, -235, -235, 177, + 177, 177, 177, 587, 587, 587, 587, 422, 422, 422, 422, 105, 105, 105, 105, + 1550, 1550, 1550, 1550, 871, 871, 871, 871, -1251, -1251, -1251, -1251, 843, + 843, 843, 843, 555, 555, 555, 555, 430, 430, 430, 430, -1103, -1103, -1103, + -1103, + /* For intt Len=4 and others, offset IZETA_NTT_OFFSET63 */ + -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275, 677, 677, 677, 677, + 677, 677, 677, 677, -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, + 448, 448, 448, 448, 448, 448, 448, 448, -725, -725, -725, -725, -725, -725, + -725, -725, -1508, -1508, -1508, -1508, -1508, 
-1508, -1508, -1508, 961, + 961, 961, 961, 961, 961, 961, 961, -398, -398, -398, -398, -398, -398, -398, + -398, -951, -951, -951, -951, -951, -951, -951, -951, -247, -247, -247, + -247, -247, -247, -247, -247, -1421, -1421, -1421, -1421, -1421, -1421, + -1421, -1421, 107, 107, 107, 107, 107, 107, 107, 107, 830, 830, 830, 830, + 830, 830, 830, 830, -271, -271, -271, -271, -271, -271, -271, -271, -90, + -90, -90, -90, -90, -90, -90, -90, -853, -853, -853, -853, -853, -853, -853, + -853, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 126, 126, 126, 126, + 126, 126, 126, 126, -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, + -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -666, -666, -666, + -666, -666, -666, -666, -666, -320, -320, -320, -320, -320, -320, -320, + -320, -8, -8, -8, -8, -8, -8, -8, -8, 516, 516, 516, 516, 516, 516, 516, + 516, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -282, -282, + -282, -282, -282, -282, -282, -282, 1491, 1491, 1491, 1491, 1491, 1491, + 1491, 1491, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1015, + 1015, 1015, 1015, 1015, 1015, 1015, 1015, -552, -552, -552, -552, -552, + -552, -552, -552, 652, 652, 652, 652, 652, 652, 652, 652, 1223, 1223, 1223, + 1223, 1223, 1223, 1223, 1223, -1571, -1571, -1571, -1571, -1571, -1571, + -1571, -1571, -205, -205, -205, -205, -205, -205, -205, -205, 411, 411, 411, + 411, 411, 411, 411, 411, -1542, -1542, -1542, -1542, -1542, -1542, -1542, + -1542, 608, 608, 608, 608, 608, 608, 608, 608, 732, 732, 732, 732, 732, 732, + 732, 732, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, -681, -681, -681, + -681, -681, -681, -681, -681, -130, -130, -130, -130, -130, -130, -130, + -130, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, 1458, 1458, + 1458, 1458, 1458, 1458, 1458, 1458, -829, -829, -829, -829, -829, -829, + -829, -829, 383, 383, 383, 383, 383, 383, 383, 383, 264, 264, 264, 264, 264, + 264, 264, 264, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 
-1325, 573, + 573, 573, 573, 573, 573, 573, 573, 1468, 1468, 1468, 1468, 1468, 1468, 1468, + 1468, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1202, -1202, + -1202, -1202, -1202, -1202, -1202, -1202, 962, 962, 962, 962, 962, 962, 962, + 962, 182, 182, 182, 182, 182, 182, 182, 182, 1577, 1577, 1577, 1577, 1577, + 1577, 1577, 1577, 622, 622, 622, 622, 622, 622, 622, 622, -171, -171, -171, + -171, -171, -171, -171, -171, 202, 202, 202, 202, 202, 202, 202, 202, 287, + 287, 287, 287, 287, 287, 287, 287, 1422, 1422, 1422, 1422, 1422, 1422, 1422, + 1422, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, -1517, -1517, -1517, + -1517, -1517, -1517, -1517, -1517, -359, -359, -359, -359, -359, -359, -359, + -359, -758, -758, -758, -758, -758, -758, -758, -758}; + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mlkem/src/native/ppc64le/src/consts.h b/mlkem/src/native/ppc64le/src/consts.h new file mode 100644 index 000000000..49f519d0c --- /dev/null +++ b/mlkem/src/native/ppc64le/src/consts.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#define MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#include "../../../common.h" + +#define NQ_OFFSET 0 +#define QINV_OFFSET 16 +#define Q_OFFSET 32 +#define C20159_OFFSET 48 +#define C1441_OFFSET 64 +#define C1353_OFFSET 80 +#define ZETA_NTT_OFFSET 96 +#define ZETA_NTT_OFFSET64 1104 +#define IZETA_NTT_OFFSET127 1616 +#define IZETA_NTT_OFFSET63 2128 + +#ifndef __ASSEMBLER__ +#define mlk_ppc_qdata MLK_NAMESPACE(ppc_qdata) +extern const int16_t mlk_ppc_qdata[]; +#endif + +#endif /* !MLK_NATIVE_PPC64LE_SRC_CONSTS_H */ diff --git a/mlkem/src/native/ppc64le/src/intt_ppc.S b/mlkem/src/native/ppc64le/src/intt_ppc.S index feb78b984..1a4975ba0 100644 --- a/mlkem/src/native/ppc64le/src/intt_ppc.S +++ b/mlkem/src/native/ppc64le/src/intt_ppc.S @@ -11,13 +11,17 @@ # #include 
"../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" .machine "any" .text # Barrett reduce constatnts #define V20159 0 -#define V_25 1 +#define V_25 1 #define V_26 2 #define V_MKQ 3 @@ -29,11 +33,11 @@ #define V_Z2 9 #define V_Z3 10 #define V_ZETA 10 -#define V1441 10 +#define V1441 10 .macro Load_4Coeffs start next step mr 9, \start # j - add 10, 4, 9 # J + len*2 + add 10, 7, 9 # J + len*2 addi 16, 9, \next addi 17, 10, \step addi 18, 16, \next @@ -73,6 +77,8 @@ xxlor 32+3, 6, 6 # V_MKQ xxlor 32+1, 7, 7 # V_25 xxlor 32+2, 8, 8 # V_26 + # Multiply Odd/Even signed halfword; + # Results word bound by 2^32 in abs value. vmulosh 6, 8, V20159 vmulesh 5, 8, V20159 vmulosh 11, 12, V20159 @@ -97,6 +103,8 @@ vadduwm 14, 14, V_25 vadduwm 17, 17, V_25 vadduwm 18, 18, V_25 + # Right shift and pack lower halfword, + # results bound to 2^16 in abs value vsraw 4, 4, V_26 vsraw 5, 5, V_26 vsraw 9, 9, V_26 @@ -113,6 +121,8 @@ vsubuhm 13, 7, 13 vpkuwum 17, 18, 17 vsubuhm 17, 7, 17 + # Modulo multiply-low unsigned halfword; + # results bound to 2^16 * q in abs value.
vmladduhm \_v0, 4, V_MKQ, 8 vmladduhm \_v1, 9, V_MKQ, 12 vmladduhm \_v2, 13, V_MKQ, 16 @@ -123,11 +133,13 @@ # MREDUCE_4X(len, start, _vz0, _vz1, _vz2, _vz3) # .macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 + # Modular multiplication bound by 2^16 * q in abs value vmladduhm 15, 25, \_vz0, 3 vmladduhm 20, 26, \_vz1, 3 vmladduhm 27, 30, \_vz2, 3 vmladduhm 28, 31, \_vz3, 3 + # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value vmhraddshs 14, 25, \_vz0, 3 vmhraddshs 19, 26, \_vz1, 3 vmhraddshs 24, 30, \_vz2, 3 @@ -265,50 +277,41 @@ MLK_ASM_FN_SYMBOL(intt_ppc) # init vectors and constants # Setup for Montgomery reduce - addis 8,2,.nmkq@toc@ha - addi 8,8,.nmkq@toc@l - lxv 0, 0(8) + lxv 0, 0(4) - lxv 32+V_QINV, 16(8) # QINV + lxv 32+V_QINV, QINV_OFFSET(4) # QINV xxlxor 32+3, 32+3, 32+3 vspltish 4, 1 - xxlor 2, 32+2, 32+2 - xxlor 3, 32+3, 32+3 - xxlor 4, 32+4, 32+4 + xxlor 2, 32+2, 32+2 # QINV + xxlor 3, 32+3, 32+3 # 0 + xxlor 4, 32+4, 32+4 # 1 # Setup for Barrett reduce - addis 8,2,.mkq@toc@ha - addi 8,8,.mkq@toc@l - addis 9,2,.C20159@toc@ha - addi 9,9,.C20159@toc@l - addis 10,2,.C25@toc@ha - addi 10,10,.C25@toc@l - - lxv 6, 0(8) # V_MKQ - lxv 32+0, 0(9) # V20159 - lxv 7, 0(10) # V_25 + lxv 6, Q_OFFSET(4) # V_MKQ + lxv 32+V20159, C20159_OFFSET(4) # V20159 + lxv 7, 0(4) # V_25 slot; vs7 is overwritten with 2^25 below #xxspltiw 8, 26 # for power9 and above vspltisw 8, 13 vadduwm 8, 8, 8 - xxlor 8, 32+8, 32+8 + xxlor 8, 32+8, 32+8 # V_26 stored in vs8 - # zetas array - #addis 14,2,.izeta63@toc@ha - #addi 14,14,.izeta63@toc@l + vspltisw 9, 1 + vsubuwm 10, 8, 9 # 25 + vslw 9, 9, 10 + xxlor 7, 32+9, 32+9 # V_25 stored in vs7 .align 4 -__Len2: +#__Len2: # # 1.
len = 2, start = 0, 4, 8, 12,...244, 248, 252 # Update zetas vectors, each vector has 2 zetas - addis 14,2,.izeta127@toc@ha - addi 14,14,.izeta127@toc@l - li 4, 4 + addi 14, 4, IZETA_NTT_OFFSET127 + li 7, 4 li 15, 4 mtctr 15 li 5, 0 -__Loop2: +intt_ppc__Loop2: Load_4Coeffs 5, 16, 16 BREDUCE_4X 4, 9, 13, 17 xxlor 10, 32+4, 32+4 @@ -333,19 +336,18 @@ __Loop2: MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 Write_Len2_4C 32+13, 32+18, 32+23, 32+28 addi 5, 5, 64 - bdnz __Loop2 + bdnz intt_ppc__Loop2 .align 4 -__Len4: +#__Len4: # # 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 - addis 14,2,.izeta63@toc@ha - addi 14,14,.izeta63@toc@l + addi 14, 4, IZETA_NTT_OFFSET63 li 5, 0 - li 4, 8 + li 7, 8 li 15, 4 # loops mtctr 15 -__Loop4: +intt_ppc__Loop4: Load_4Coeffs 5, 16, 16 BREDUCE_4X 4, 9, 13, 17 xxlor 10, 32+4, 32+4 @@ -369,13 +371,13 @@ __Loop4: MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 Write_Len4_4C 32+13, 32+18, 32+23, 32+28 addi 5, 5, 64 - bdnz __Loop4 + bdnz intt_ppc__Loop4 .align 4 -__Len8: +#__Len8: # 3. len = 8, start = 0, 16, 32, 48,...208, 224, 240 #addi 14, 14, 512 - li 4, 16 + li 7, 16 li 5, 0 Load_4Coeffs 5, 32, 32 @@ -414,12 +416,12 @@ __Len8: Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 -__Len16: +#__Len16: # # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 #addi 14, 14, 768 li 5, 0 - li 4, 32 + li 7, 32 Load_4Coeffs 5, 64, 64 BREDUCE_4X 4, 9, 13, 17 @@ -458,12 +460,12 @@ __Len16: Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 -__Len32: +#__Len32: # # 5. len = 32, start = 0, 64, 128, 192 #addi 14, 14, 896 li 5, 0 - li 4, 64 + li 7, 64 Load_4Coeffs 5, 16, 16 BREDUCE_4X 4, 9, 13, 17 @@ -505,12 +507,12 @@ __Len32: Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 -__Len64: +#__Len64: # # 6. len = 64, start = 0, 128 #addi 14, 14, 960 li 5, 0 - li 4, 128 + li 7, 128 Load_4Coeffs 5, 16, 16 BREDUCE_4X 4, 9, 13, 17 Write_B4C 32+4, 32+9, 32+13, 32+17 @@ -549,12 +551,12 @@ __Len64: Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 -__Len128: +#__Len128: # 7. 
len = 128, start = 0 # #addi 14, 14, 992 li 5, 0 # start - li 4, 256 # len * 2 + li 7, 256 # len * 2 Load_4Coeffs 5, 16, 16 BREDUCE_4X 4, 9, 13, 17 @@ -596,9 +598,8 @@ __Len128: # # Montgomery reduce loops with constant 1441 # - addis 10,2,.C1441@toc@ha - addi 10,10,.C1441@toc@l - lvx V1441, 0, 10 + addi 14, 4, C1441_OFFSET + lvx V1441, 0, 14 Reload_4coeffs MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 @@ -624,7 +625,6 @@ __Len128: MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 -__intt_out: lxv 32+20, 128(1) lxv 32+21, 144(1) lxv 32+22, 160(1) @@ -651,123 +651,35 @@ __intt_out: addi 1, 1, 352 blr -.data -.align 4 -# -MLKEM_Q -.nmkq: -.short -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329 -# QINV -.short -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327 - -# MLKEM_Q -.mkq: -.short 3329, 3329, 3329, 3329, 3329, 3329, 3329, 3329 - -.C20159: -.short 20159, 20159, 20159, 20159, 20159, 20159, 20159, 20159 - -# 0x2000000 -.C25: -.long 33554432, 33554432, 33554432, 33554432 - -.C1441: -.short 1441, 1441, 1441, 1441, 1441, 1441, 1441, 1441 - -.align 4 -.izeta127: -.short 1628, 1628, 1628, 1628, 1522, 1522, 1522, 1522 -.short -1460, -1460, -1460, -1460, 958, 958, 958, 958 -.short 991, 991, 991, 991, 996, 996, 996, 996 -.short -308, -308, -308, -308, -108, -108, -108, -108 -.short 478, 478, 478, 478, -870, -870, -870, -870 -.short -854, -854, -854, -854, -1510, -1510, -1510, -1510 -.short 794, 794, 794, 794, -1278, -1278, -1278, -1278 -.short -1530, -1530, -1530, -1530, -1185, -1185, -1185, -1185 -.short -1659, -1659, -1659, -1659, -1187, -1187, -1187, -1187 -.short 220, 220, 220, 220, -874, -874, -874, -874 -.short -1335, -1335, -1335, -1335, 1218, 1218, 1218, 1218 -.short -136, -136, -136, -136, -1215, -1215, -1215, -1215 -.short 384, 384, 384, 384, -1465, -1465, -1465, -1465 -.short -1285, -1285, -1285, -1285, 1322, 1322, 1322, 1322 -.short 610, 610, 610, 610, 603, 603, 603, 603 -.short 
1097, 1097, 1097, 1097, 817, 817, 817, 817 -.short -75, -75, -75, -75, -156, -156, -156, -156 -.short 329, 329, 329, 329, 418, 418, 418, 418 -.short 349, 349, 349, 349, -872, -872, -872, -872 -.short 644, 644, 644, 644, -1590, -1590, -1590, -1590 -.short 1119, 1119, 1119, 1119, -602, -602, -602, -602 -.short 1483, 1483, 1483, 1483, -777, -777, -777, -777 -.short -147, -147, -147, -147, 1159, 1159, 1159, 1159 -.short 778, 778, 778, 778, -246, -246, -246, -246 -.short 1653, 1653, 1653, 1653, 1574, 1574, 1574, 1574 -.short -460, -460, -460, -460, -291, -291, -291, -291 -.short -235, -235, -235, -235, 177, 177, 177, 177 -.short 587, 587, 587, 587, 422, 422, 422, 422 -.short 105, 105, 105, 105, 1550, 1550, 1550, 1550 -.short 871, 871, 871, 871, -1251, -1251, -1251, -1251 -.short 843, 843, 843, 843, 555, 555, 555, 555 -.short 430, 430, 430, 430, -1103, -1103, -1103, -1103 -.izeta63: -.short -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275 -.short 677, 677, 677, 677, 677, 677, 677, 677 -.short -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065 -.short 448, 448, 448, 448, 448, 448, 448, 448 -.short -725, -725, -725, -725, -725, -725, -725, -725 -.short -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508 -.short 961, 961, 961, 961, 961, 961, 961, 961 -.short -398, -398, -398, -398, -398, -398, -398, -398 -.short -951, -951, -951, -951, -951, -951, -951, -951 -.short -247, -247, -247, -247, -247, -247, -247, -247 -.short -1421, -1421, -1421, -1421, -1421, -1421, -1421, -1421 -.short 107, 107, 107, 107, 107, 107, 107, 107 -.short 830, 830, 830, 830, 830, 830, 830, 830 -.short -271, -271, -271, -271, -271, -271, -271, -271 -.short -90, -90, -90, -90, -90, -90, -90, -90 -.short -853, -853, -853, -853, -853, -853, -853, -853 -.short 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469 -.short 126, 126, 126, 126, 126, 126, 126, 126 -.short -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162 -.short -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618 -.short -666, 
-666, -666, -666, -666, -666, -666, -666 -.short -320, -320, -320, -320, -320, -320, -320, -320 -.short -8, -8, -8, -8, -8, -8, -8, -8 -.short 516, 516, 516, 516, 516, 516, 516, 516 -.short -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544 -.short -282, -282, -282, -282, -282, -282, -282, -282 -.short 1491, 1491, 1491, 1491, 1491, 1491, 1491, 1491 -.short -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293 -.short 1015, 1015, 1015, 1015, 1015, 1015, 1015, 1015 -.short -552, -552, -552, -552, -552, -552, -552, -552 -.short 652, 652, 652, 652, 652, 652, 652, 652 -.short 1223, 1223, 1223, 1223, 1223, 1223, 1223, 1223 -.short -1571, -1571, -1571, -1571, -1571, -1571, -1571, -1571 -.short -205, -205, -205, -205, -205, -205, -205, -205 -.short 411, 411, 411, 411, 411, 411, 411, 411 -.short -1542, -1542, -1542, -1542, -1542, -1542, -1542, -1542 -.short 608, 608, 608, 608, 608, 608, 608, 608 -.short 732, 732, 732, 732, 732, 732, 732, 732 -.short 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017 -.short -681, -681, -681, -681, -681, -681, -681, -681 -.short -130, -130, -130, -130, -130, -130, -130, -130 -.short -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602 -.short 1458, 1458, 1458, 1458, 1458, 1458, 1458, 1458 -.short -829, -829, -829, -829, -829, -829, -829, -829 -.short 383, 383, 383, 383, 383, 383, 383, 383 -.short 264, 264, 264, 264, 264, 264, 264, 264 -.short -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325 -.short 573, 573, 573, 573, 573, 573, 573, 573 -.short 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468 -.short -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474 -.short -1202, -1202, -1202, -1202, -1202, -1202, -1202, -1202 -.short 962, 962, 962, 962, 962, 962, 962, 962 -.short 182, 182, 182, 182, 182, 182, 182, 182 -.short 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577 -.short 622, 622, 622, 622, 622, 622, 622, 622 -.short -171, -171, -171, -171, -171, -171, -171, -171 -.short 202, 202, 202, 202, 202, 202, 202, 202 -.short 287, 287, 287, 
287, 287, 287, 287, 287 -.short 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422 -.short 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493 -.short -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517 -.short -359, -359, -359, -359, -359, -359, -359, -359 -.short -758, -758, -758, -758, -758, -758, -758, -758 +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ +#undef V_QINV +#undef V_NMKQ +#undef V_Z0 +#undef V_Z1 +#undef V_Z2 +#undef V_Z3 +#undef V_ZETA +#undef V1441 + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ +#undef V_QINV +#undef V_NMKQ +#undef V_Z0 +#undef V_Z1 +#undef V_Z2 +#undef V_Z3 +#undef V_ZETA +#undef V1441 diff --git a/mlkem/src/native/ppc64le/src/ntt_ppc.S b/mlkem/src/native/ppc64le/src/ntt_ppc.S index 172fef9cc..e9a8df81f 100644 --- a/mlkem/src/native/ppc64le/src/ntt_ppc.S +++ b/mlkem/src/native/ppc64le/src/ntt_ppc.S @@ -11,14 +11,18 @@ # #include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) -#define V_QINV 2 -#define V_NMKQ 5 -#define V_Z0 7 -#define V_Z1 8 -#define V_Z2 9 -#define V_Z3 10 -#define V_ZETA 10 +#include "consts.h" + +#define V_QINV 2 +#define V_NMKQ 5 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 +#define V_ZETA 10 .machine "any" .text @@ -33,7 +37,7 @@ # .macro MREDUCE_4X start next step _vz0 _vz1 _vz2 _vz3 mr 9, \start - add 10, 4, 9 # J + len*2 + add 10, 7, 9 # J + len*2 addi 16, 9, \next addi 17, 10, \step addi 18, 16, \next @@ -50,11 +54,13 @@ xxpermdi 32+28, 32+28, 32+28, 2 # fqmul = zeta * coefficient + # Modular multiplication bound by 2^16
* q in abs value vmladduhm 15, 13, \_vz0, 3 vmladduhm 20, 18, \_vz1, 3 vmladduhm 25, 23, \_vz2, 3 vmladduhm 30, 28, \_vz3, 3 + # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value vmhraddshs 14, 13, \_vz0, 3 vmhraddshs 19, 18, \_vz1, 3 vmhraddshs 24, 23, \_vz2, 3 @@ -84,6 +90,9 @@ xxpermdi 32+22, 32+22, 32+22, 2 xxpermdi 32+27, 32+27, 32+27, 2 + # The result of the Montgomery multiplication is bounded + # by q in absolute value. + # Finish by applying the final add/sub update to the results vsubuhm 16, 12, 13 # r - t vadduhm 15, 13, 12 # r + t vsubuhm 21, 17, 18 # r - t @@ -175,20 +184,18 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) stxv 32+31, 304(1) # get MLKEM_Q - addis 8,2,.nmkq@toc@ha - addi 8,8,.nmkq@toc@l - lvx V_NMKQ,0,8 + lvx V_NMKQ,0,4 # zetas array - addis 14,2,.K1@toc@ha - addi 14,14,.K1@toc@l + addi 14, 4, ZETA_NTT_OFFSET vxor 3, 3, 3 vspltish 4, 1 - lxv 32+V_QINV, 16(8) + + lxv 32+V_QINV, QINV_OFFSET(4) .align 4 -__Len128: +#__Len128: # # Compute coefficients of the NTT based on the following loop. # for (len = 128; len ≥ 2; len = len/2) @@ -196,7 +203,7 @@ __Len128: # 1. len = 128, start = 0 # li 5, 0 # start - li 4, 256 # len * 2 + li 7, 256 # len * 2 lvx V_ZETA, 0, 14 addi 14, 14, 16 @@ -213,12 +220,12 @@ __Len128: Write_One .align 4 -__Len64: +#__Len64: # # 2. len = 64, start = 0, 128 # k += 2 li 5, 0 - li 4, 128 + li 7, 128 lvx V_ZETA, 0, 14 addi 14, 14, 16 MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA @@ -237,12 +244,12 @@ __Len64: Write_One .align 4 -__Len32: +#__Len32: # # 3. len = 32, start = 0, 64, 128, 192 # k += 4 li 5, 0 - li 4, 64 + li 7, 64 lvx V_ZETA, 0, 14 addi 14, 14, 16 MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA @@ -270,12 +277,12 @@ __Len32: Write_One .align 4 -__Len16: +#__Len16: # # 4.
len = 16, start = 0, 32, 64,,...160, 192, 224 # k += 8 li 5, 0 - li 4, 32 + li 7, 32 Load_next_4zetas MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 Write_One @@ -292,12 +299,12 @@ __Len16: Write_One .align 4 -__Len8: +#__Len8: # # 5. len = 8, start = 0, 16, 32, 48,...208, 224, 240 # k += 16 li 5, 0 - li 4, 16 + li 7, 16 Load_next_4zetas MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 Write_One @@ -323,9 +330,9 @@ __Len8: li 15, 4 # loops mtctr 15 li 5, 0 - li 4, 8 + li 7, 8 .align 4 -__Len4: +ntt_ppc__Len4: Load_next_4zetas MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 Write_Two @@ -336,21 +343,21 @@ __Len4: Write_Two addi 5, 5, 64 - bdnz __Len4 + bdnz ntt_ppc__Len4 # # 7. len = 2, start = 0, 4, 8, 12,...244, 248, 252 # k += 64 # Update zetas vectors, each vector has 2 zetas - addis 14,2,.K64@toc@ha - addi 14,14,.K64@toc@l + + addi 14, 4, ZETA_NTT_OFFSET64 li 15, 4 mtctr 15 li 5, 0 - li 4, 4 + li 7, 4 .align 4 -__Len2: +ntt_ppc__Len2: Load_next_4zetas MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 Write_Three @@ -361,9 +368,8 @@ __Len2: Write_Three addi 5, 5, 64 - bdnz __Len2 + bdnz ntt_ppc__Len2 -__ntt_out: lxv 32+20, 128(1) lxv 32+21, 144(1) lxv 32+22, 160(1) @@ -390,109 +396,17 @@ __ntt_out: addi 1, 1, 352 blr -.data -.align 4 -# -MLKEM_Q -.nmkq: -.short -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329 -# QINV -.short -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327 - -# zetas -.K1: -.short -758, -758, -758, -758, -758, -758, -758, -758 -.short -359, -359, -359, -359, -359, -359, -359, -359 -.short -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517 -.short 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493 -.short 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422 -.short 287, 287, 287, 287, 287, 287, 287, 287 -.short 202, 202, 202, 202, 202, 202, 202, 202 -.short -171, -171, -171, -171, -171, -171, -171, -171 -.short 622, 622, 622, 622, 622, 622, 622, 622 -.short 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577 -.short 182, 182, 182, 182, 182, 182, 182, 182 
-.short 962, 962, 962, 962, 962, 962, 962, 962 -.short -1202, -1202, -1202, -1202, -1202, -1202, -1202, -1202 -.short -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474 -.short 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468 -.short 573, 573, 573, 573, 573, 573, 573, 573 -.short -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325 -.short 264, 264, 264, 264, 264, 264, 264, 264 -.short 383, 383, 383, 383, 383, 383, 383, 383 -.short -829, -829, -829, -829, -829, -829, -829, -829 -.short 1458, 1458, 1458, 1458, 1458, 1458, 1458, 1458 -.short -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602 -.short -130, -130, -130, -130, -130, -130, -130, -130 -.short -681, -681, -681, -681, -681, -681, -681, -681 -.short 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017 -.short 732, 732, 732, 732, 732, 732, 732, 732 -.short 608, 608, 608, 608, 608, 608, 608, 608 -.short -1542, -1542, -1542, -1542, -1542, -1542, -1542, -1542 -.short 411, 411, 411, 411, 411, 411, 411, 411 -.short -205, -205, -205, -205, -205, -205, -205, -205 -.short -1571, -1571, -1571, -1571, -1571, -1571, -1571, -1571 -.short 1223, 1223, 1223, 1223, 1223, 1223, 1223, 1223 -.short 652, 652, 652, 652, 652, 652, 652, 652 -.short -552, -552, -552, -552, -552, -552, -552, -552 -.short 1015, 1015, 1015, 1015, 1015, 1015, 1015, 1015 -.short -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293 -.short 1491, 1491, 1491, 1491, 1491, 1491, 1491, 1491 -.short -282, -282, -282, -282, -282, -282, -282, -282 -.short -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544 -.short 516, 516, 516, 516, 516, 516, 516, 516 -.short -8, -8, -8, -8, -8, -8, -8, -8 -.short -320, -320, -320, -320, -320, -320, -320, -320 -.short -666, -666, -666, -666, -666, -666, -666, -666 -.short -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618 -.short -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162 -.short 126, 126, 126, 126, 126, 126, 126, 126 -.short 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469 -.short -853, -853, -853, -853, 
-853, -853, -853, -853 -.short -90, -90, -90, -90, -90, -90, -90, -90 -.short -271, -271, -271, -271, -271, -271, -271, -271 -.short 830, 830, 830, 830, 830, 830, 830, 830 -.short 107, 107, 107, 107, 107, 107, 107, 107 -.short -1421, -1421, -1421, -1421, -1421, -1421, -1421, -1421 -.short -247, -247, -247, -247, -247, -247, -247, -247 -.short -951, -951, -951, -951, -951, -951, -951, -951 -.short -398, -398, -398, -398, -398, -398, -398, -398 -.short 961, 961, 961, 961, 961, 961, 961, 961 -.short -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508 -.short -725, -725, -725, -725, -725, -725, -725, -725 -.short 448, 448, 448, 448, 448, 448, 448, 448 -.short -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065 -.short 677, 677, 677, 677, 677, 677, 677, 677 -.short -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275 -.K64: -.short -1103, -1103, -1103, -1103, 430, 430, 430, 430 -.short 555, 555, 555, 555, 843, 843, 843, 843 -.short -1251, -1251, -1251, -1251, 871, 871, 871, 871 -.short 1550, 1550, 1550, 1550, 105, 105, 105, 105 -.short 422, 422, 422, 422, 587, 587, 587, 587 -.short 177, 177, 177, 177, -235, -235, -235, -235 -.short -291, -291, -291, -291, -460, -460, -460, -460 -.short 1574, 1574, 1574, 1574, 1653, 1653, 1653, 1653 -.short -246, -246, -246, -246, 778, 778, 778, 778 -.short 1159, 1159, 1159, 1159, -147, -147, -147, -147 -.short -777, -777, -777, -777, 1483, 1483, 1483, 1483 -.short -602, -602, -602, -602, 1119, 1119, 1119, 1119 -.short -1590, -1590, -1590, -1590, 644, 644, 644, 644 -.short -872, -872, -872, -872, 349, 349, 349, 349 -.short 418, 418, 418, 418, 329, 329, 329, 329 -.short -156, -156, -156, -156, -75, -75, -75, -75 -.short 817, 817, 817, 817, 1097, 1097, 1097, 1097 -.short 603, 603, 603, 603, 610, 610, 610, 610 -.short 1322, 1322, 1322, 1322, -1285, -1285, -1285, -1285 -.short -1465, -1465, -1465, -1465, 384, 384, 384, 384 -.short -1215, -1215, -1215, -1215, -136, -136, -136, -136 -.short 1218, 1218, 1218, 1218, -1335, -1335, -1335, 
-1335 -.short -874, -874, -874, -874, 220, 220, 220, 220 -.short -1187, -1187, -1187, -1187, -1659, -1659, -1659, -1659 -.short -1185, -1185, -1185, -1185, -1530, -1530, -1530, -1530 -.short -1278, -1278, -1278, -1278, 794, 794, 794, 794 -.short -1510, -1510, -1510, -1510, -854, -854, -854, -854 -.short -870, -870, -870, -870, 478, 478, 478, 478 -.short -108, -108, -108, -108, -308, -308, -308, -308 -.short 996, 996, 996, 996, 991, 991, 991, 991 -.short 958, 958, 958, 958, -1460, -1460, -1460, -1460 -.short 1522, 1522, 1522, 1522, 1628, 1628, 1628, 1628 +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V_QINV +#undef V_NMKQ +#undef V_ZETA + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V_QINV +#undef V_NMKQ +#undef V_ZETA diff --git a/mlkem/src/native/ppc64le/src/poly_tomont.S b/mlkem/src/native/ppc64le/src/poly_tomont.S index c07f25c5a..eb770a631 100644 --- a/mlkem/src/native/ppc64le/src/poly_tomont.S +++ b/mlkem/src/native/ppc64le/src/poly_tomont.S @@ -17,8 +17,12 @@ # #include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) -#define V1353 0 +#include "consts.h" + +#define V1353 0 #define V_QINV 2 #define V_NMKQ 5 @@ -98,14 +102,9 @@ MLK_ASM_FN_SYMBOL(poly_tomont_ppc) stxv 32+29, 272(1) stxv 32+30, 288(1) - addis 9,2,.nmkq@toc@ha - addi 9,9,.nmkq@toc@l - addis 10,2,.C1353@toc@ha - addi 10,10,.C1353@toc@l - - lxv 32+V_NMKQ,0(9) - lxv 32+V_QINV,16(9) - lxv 32+V1353,0(10) + lxv 32+V_NMKQ, NQ_OFFSET(4) + lxv 32+V_QINV, QINV_OFFSET(4) + lxv 32+V1353, C1353_OFFSET(4) vxor 3, 3, 3 vspltish 4, 1 @@ -150,14 +149,17 @@ MLK_ASM_FN_SYMBOL(poly_tomont_ppc) addi 1, 1, 320 blr -.data -.align 4 -# 
-MLKEM_Q -.nmkq: -.short -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329 -# QINV -.short -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327 +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V1353 +#undef V_QINV +#undef V_NMKQ -.C1353: -.short 1353, 1353, 1353, 1353, 1353, 1353, 1353, 1353 +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V1353 +#undef V_QINV +#undef V_NMKQ diff --git a/mlkem/src/native/ppc64le/src/reduce.S b/mlkem/src/native/ppc64le/src/reduce.S index ee8e1fdca..558410955 100644 --- a/mlkem/src/native/ppc64le/src/reduce.S +++ b/mlkem/src/native/ppc64le/src/reduce.S @@ -18,10 +18,14 @@ # #include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" # Barrett reduce constatnts #define V20159 0 -#define V_25 1 +#define V_25 1 #define V_26 2 #define V_MKQ 3 @@ -136,18 +140,16 @@ MLK_ASM_FN_SYMBOL(reduce_ppc) stxv 32+23, 176(1) stxv 32+24, 192(1) - addis 8,2,.mkq@toc@ha - addi 8,8,.mkq@toc@l - addis 9,2,.C20159@toc@ha - addi 9,9,.C20159@toc@l - addis 10,2,.C25@toc@ha - addi 10,10,.C25@toc@l - vxor 7, 7, 7 - lxv 32+V_MKQ, 0(8) - lxv 32+V20159, 0(9) - lxv 32+V_25, 0(10) + lxv 32+V_MKQ, Q_OFFSET(4) + lxv 32+V20159, C20159_OFFSET(4) + + vspltisw V_26, 13 + vadduwm V_26, V_26, V_26 + vspltisw 4, 1 + vsubuwm 5, V_26, 4 + vslw V_25, 4, 5 li 4, -128 li 5, -112 @@ -162,9 +164,6 @@ MLK_ASM_FN_SYMBOL(reduce_ppc) li 15, 32 li 16, 48 - vspltisw V_26, 13 - vadduwm V_26, V_26, V_26 - BREDUCE_4X 21, 22, 23, 24 BREDUCE_4X 4, 9, 13, 17 Write_8X @@ -211,15 +210,19 @@ MLK_ASM_FN_SYMBOL(reduce_ppc) addi 1, 1, 224 blr -.align 4 -.data -# MLKEM_Q -.mkq: -.short 3329, 3329, 3329, 
3329, 3329, 3329, 3329, 3329 - -.C20159: -.short 20159, 20159, 20159, 20159, 20159, 20159, 20159, 20159 - -# 0x2000000 -.C25: -.long 33554432, 33554432, 33554432, 33554432 +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ From f3a7d3cb890a4efa169fe1b7316b69e7752c7b01 Mon Sep 17 00:00:00 2001 From: willieyz Date: Wed, 10 Sep 2025 16:51:26 +0800 Subject: [PATCH 08/16] Add MLK_CONFIG_NO_RANDOMIZED_API with default not set Signed-off-by: willieyz Signed-off-by: Danny Tsen --- mlkem/src/config.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/mlkem/src/config.h b/mlkem/src/config.h index 53b936647..8047ec0d4 100644 --- a/mlkem/src/config.h +++ b/mlkem/src/config.h @@ -451,6 +451,24 @@ *****************************************************************************/ /* #define MLK_CONFIG_NO_ASM */ +/****************************************************************************** + * Name: MLK_CONFIG_NO_RANDOMIZED_API + * + * Description: If this option is set, mlkem-native will be built without the + * randomized API functions (crypto_kem_keypair and + * crypto_kem_enc). + * This allows users to build mlkem-native without providing a + * randombytes() implementation if they only need the + * deterministic API + * (crypto_kem_keypair_derand, crypto_kem_enc_derand, + * crypto_kem_dec). + * + * NOTE: This option is incompatible with MLK_CONFIG_KEYGEN_PCT + * as the current PCT implementation requires crypto_kem_enc().
+ * + *****************************************************************************/ +/* #define MLK_CONFIG_NO_RANDOMIZED_API */ + /****************************************************************************** * Name: MLK_CONFIG_KEYGEN_PCT * From 25d3218cfa1fb62648b8ce82c5d3c2a2fbfc52cb Mon Sep 17 00:00:00 2001 From: willieyz Date: Wed, 10 Sep 2025 17:55:41 +0800 Subject: [PATCH 09/16] Guard the `crypto_kem_keypair` and `crypto_kem_enc` from: - `kem.c` - `randombytes.h` - `mlkem_native.h` using `MLK_CONFIG_NO_RANDOMIZED_API` Also, add a check in `common.h` to ensure `MLK_CONFIG_NO_RANDOMIZED_API` is not used together with `MLK_CONFIG_KEYGEN_PCT` Signed-off-by: willieyz Signed-off-by: Danny Tsen --- mlkem/mlkem_native.h | 4 ++++ mlkem/src/common.h | 4 ++++ mlkem/src/kem.c | 4 ++++ mlkem/src/randombytes.h | 3 ++- 4 files changed, 14 insertions(+), 1 deletion(-) diff --git a/mlkem/mlkem_native.h b/mlkem/mlkem_native.h index dec00e684..e6d386222 100644 --- a/mlkem/mlkem_native.h +++ b/mlkem/mlkem_native.h @@ -155,6 +155,7 @@ int MLK_API_NAMESPACE(keypair_derand)( uint8_t sk[MLKEM_SECRETKEYBYTES(MLK_CONFIG_API_PARAMETER_SET)], const uint8_t coins[2 * MLKEM_SYMBYTES]); +#if !defined(MLK_CONFIG_NO_RANDOMIZED_API) /************************************************* * Name: crypto_kem_keypair * @@ -176,6 +177,7 @@ MLK_API_MUST_CHECK_RETURN_VALUE int MLK_API_NAMESPACE(keypair)( uint8_t pk[MLKEM_PUBLICKEYBYTES(MLK_CONFIG_API_PARAMETER_SET)], uint8_t sk[MLKEM_SECRETKEYBYTES(MLK_CONFIG_API_PARAMETER_SET)]); +#endif /* !MLK_CONFIG_NO_RANDOMIZED_API */ /************************************************* * Name: crypto_kem_enc_derand @@ -206,6 +208,7 @@ int MLK_API_NAMESPACE(enc_derand)( const uint8_t pk[MLKEM_PUBLICKEYBYTES(MLK_CONFIG_API_PARAMETER_SET)], const uint8_t coins[MLKEM_SYMBYTES]); +#if !defined(MLK_CONFIG_NO_RANDOMIZED_API) /************************************************* * Name: crypto_kem_enc * @@ -231,6 +234,7 @@ int MLK_API_NAMESPACE(enc)( uint8_t 
ct[MLKEM_CIPHERTEXTBYTES(MLK_CONFIG_API_PARAMETER_SET)], uint8_t ss[MLKEM_BYTES], const uint8_t pk[MLKEM_PUBLICKEYBYTES(MLK_CONFIG_API_PARAMETER_SET)]); +#endif /* !MLK_CONFIG_NO_RANDOMIZED_API */ /************************************************* * Name: crypto_kem_dec diff --git a/mlkem/src/common.h b/mlkem/src/common.h index 652ef6c7e..d03f3db7d 100644 --- a/mlkem/src/common.h +++ b/mlkem/src/common.h @@ -99,6 +99,10 @@ #error Bad configuration: MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 is set, but MLK_CONFIG_FIPS202_BACKEND_FILE is not. #endif +#if defined(MLK_CONFIG_NO_RANDOMIZED_API) && defined(MLK_CONFIG_KEYGEN_PCT) +#error Bad configuration: MLK_CONFIG_NO_RANDOMIZED_API is incompatible with MLK_CONFIG_KEYGEN_PCT as the current PCT implementation requires crypto_kem_enc() +#endif + #if defined(MLK_CONFIG_USE_NATIVE_BACKEND_ARITH) #include MLK_CONFIG_ARITH_BACKEND_FILE /* Include to enforce consistency of API and implementation, diff --git a/mlkem/src/kem.c b/mlkem/src/kem.c index 65099d847..01430e2c2 100644 --- a/mlkem/src/kem.c +++ b/mlkem/src/kem.c @@ -199,6 +199,7 @@ int crypto_kem_keypair_derand(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES], return 0; } +#if !defined(MLK_CONFIG_NO_RANDOMIZED_API) /* Reference: `crypto_kem_keypair()` in the reference implementation @[REF] * - We zeroize the stack buffer */ MLK_EXTERNAL_API @@ -219,6 +220,7 @@ int crypto_kem_keypair(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES], mlk_zeroize(coins, sizeof(coins)); return res; } +#endif /* !MLK_CONFIG_NO_RANDOMIZED_API */ /* Reference: `crypto_kem_enc_derand()` in the reference implementation @[REF] * - We include public key check @@ -258,6 +260,7 @@ int crypto_kem_enc_derand(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES], return 0; } +#if !defined(MLK_CONFIG_NO_RANDOMIZED_API) /* Reference: `crypto_kem_enc()` in the reference implementation @[REF] * - We include stack buffer zeroization */ MLK_EXTERNAL_API @@ -278,6 +281,7 @@ int crypto_kem_enc(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES], 
mlk_zeroize(coins, sizeof(coins)); return res; } +#endif /* !MLK_CONFIG_NO_RANDOMIZED_API */ /* Reference: `crypto_kem_dec()` in the reference implementation @[REF] * - We include secret key check diff --git a/mlkem/src/randombytes.h b/mlkem/src/randombytes.h index 132d920af..1927afce2 100644 --- a/mlkem/src/randombytes.h +++ b/mlkem/src/randombytes.h @@ -11,6 +11,7 @@ #include "cbmc.h" #include "common.h" +#if !defined(MLK_CONFIG_NO_RANDOMIZED_API) #if !defined(MLK_CONFIG_CUSTOM_RANDOMBYTES) void randombytes(uint8_t *out, size_t outlen); static MLK_INLINE void mlk_randombytes(uint8_t *out, size_t outlen) @@ -18,5 +19,5 @@ __contract__( requires(memory_no_alias(out, outlen)) assigns(memory_slice(out, outlen))) { randombytes(out, outlen); } #endif /* !MLK_CONFIG_CUSTOM_RANDOMBYTES */ - +#endif /* !MLK_CONFIG_NO_RANDOMIZED_API */ #endif /* !MLK_RANDOMBYTES_H */ From b4d8771a875b28dfd20faf8fb0667e521a72f05a Mon Sep 17 00:00:00 2001 From: willieyz Date: Wed, 10 Sep 2025 19:47:44 +0800 Subject: [PATCH 10/16] Add new example `basic_deterministic` This commit: - Adds the `basic_deterministic` example demonstrating deterministic API usage (without a `randombytes()` implementation). - Uses only the `crypto_kem_*_derand` functions, no `randombytes()` required. - Updates the expected key outputs for deterministic entropy inputs (Alice: all 0 input, Bob: all 1 input). 
Signed-off-by: willieyz Signed-off-by: Danny Tsen --- .github/workflows/base.yml | 3 + BIBLIOGRAPHY.md | 1 + Makefile | 1 + examples/README.md | 3 + examples/basic_deterministic/.gitignore | 3 + examples/basic_deterministic/Makefile | 96 ++++ examples/basic_deterministic/README.md | 17 + examples/basic_deterministic/main.c | 109 ++++ .../custom_no_randomized_config.h | 531 ++++++++++++++++++ .../basic_deterministic/mlkem_native/mlkem | 1 + scripts/tests | 9 + 11 files changed, 774 insertions(+) create mode 100644 examples/basic_deterministic/.gitignore create mode 100644 examples/basic_deterministic/Makefile create mode 100644 examples/basic_deterministic/README.md create mode 100644 examples/basic_deterministic/main.c create mode 100644 examples/basic_deterministic/mlkem_native/custom_no_randomized_config.h create mode 120000 examples/basic_deterministic/mlkem_native/mlkem diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml index 63e91a6a5..435f2fe5e 100644 --- a/.github/workflows/base.yml +++ b/.github/workflows/base.yml @@ -227,6 +227,9 @@ jobs: - name: basic run: | CFLAGS="-O0" make run -C examples/basic + - name: basic_deterministic + run: | + CFLAGS="-O0" make run -C examples/basic_deterministic - name: bring_your_own_fips202 run: | CFLAGS="-O0" make run -C examples/bring_your_own_fips202 diff --git a/BIBLIOGRAPHY.md b/BIBLIOGRAPHY.md index e8c0bca7b..d75d368ef 100644 --- a/BIBLIOGRAPHY.md +++ b/BIBLIOGRAPHY.md @@ -26,6 +26,7 @@ source code and documentation. 
- National Institute of Standards and Technology * URL: https://csrc.nist.gov/projects/cryptographic-module-validation-program/fips-140-3-ig-announcements * Referenced from: + - [examples/basic_deterministic/mlkem_native/custom_no_randomized_config.h](examples/basic_deterministic/mlkem_native/custom_no_randomized_config.h) - [integration/liboqs/config_aarch64.h](integration/liboqs/config_aarch64.h) - [integration/liboqs/config_c.h](integration/liboqs/config_c.h) - [integration/liboqs/config_ppc64le.h](integration/liboqs/config_ppc64le.h) diff --git a/Makefile b/Makefile index 058f0a7f4..2c4aa6438 100644 --- a/Makefile +++ b/Makefile @@ -218,6 +218,7 @@ clean: -make clean -C examples/bring_your_own_fips202 >/dev/null -make clean -C examples/custom_backend >/dev/null -make clean -C examples/basic >/dev/null + -make clean -C examples/basic_deterministic >/dev/null -make clean -C examples/monolithic_build >/dev/null -make clean -C examples/monolithic_build_native >/dev/null -make clean -C examples/monolithic_build_multilevel >/dev/null diff --git a/examples/README.md b/examples/README.md index cccfdeb90..65957ebc3 100644 --- a/examples/README.md +++ b/examples/README.md @@ -8,6 +8,9 @@ This directory contains minimal examples demonstrating how you can use mlkem-nat See [basic](basic) for a basic example of how to build a single instance of mlkem-native. +## Basic_deterministic + +See [basic_deterministic](basic_deterministic) for a basic example of how to build a single instance of mlkem-native without a `randombytes()` implementation. This allows users to build mlkem-native using only the deterministic API when randomized functions are not required.
## Multi-level build (C only) See [multilevel_build](multilevel_build) for an example of how to build one instance of mlkem-native per security level, diff --git a/examples/basic_deterministic/.gitignore b/examples/basic_deterministic/.gitignore new file mode 100644 index 000000000..eb98a94f1 --- /dev/null +++ b/examples/basic_deterministic/.gitignore @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +build diff --git a/examples/basic_deterministic/Makefile b/examples/basic_deterministic/Makefile new file mode 100644 index 000000000..38ac80d55 --- /dev/null +++ b/examples/basic_deterministic/Makefile @@ -0,0 +1,96 @@ +# (SPDX-License-Identifier: CC-BY-4.0) + +.PHONY: build run clean size +.DEFAULT_GOAL := all + +# Append cross-prefix for cross compilation +# Remove or ignore for native builds +CC ?= gcc +SIZE ?= size +# When called from the root Makefile, CROSS_PREFIX has already been added here +ifeq (,$(findstring $(CROSS_PREFIX),$(CC))) +CC := $(CROSS_PREFIX)$(CC) +endif + +ifeq (,$(findstring $(CROSS_PREFIX),$(SIZE))) +SIZE := $(CROSS_PREFIX)$(SIZE) +endif + +# Part A: +# +# mlkem-native source and header files +# +# If you are not concerned about minimizing for a specific backend, +# you can just include _all_ source files into your build. 
+MLK_SOURCE=$(wildcard \ + mlkem_native/mlkem/src/*.c \ + mlkem_native/mlkem/src/**/*.c \ + mlkem_native/mlkem/src/**/**/*.c \ + mlkem_native/mlkem/src/**/**/**/*.c) + +# Part B: +# +# Your application source code +APP_SOURCE=$(wildcard *.c) + +ALL_SOURCE=$(MLK_SOURCE) $(RNG_SOURCE) $(APP_SOURCE) + +BUILD_DIR=build +BIN=test_binary + +CFLAGS := \ + -Wall \ + -Wextra \ + -Werror \ + -Wmissing-prototypes \ + -Wshadow \ + -Werror \ + -Wpointer-arith \ + -Wredundant-decls \ + -Wno-long-long \ + -Wno-unknown-pragmas \ + -Wno-unused-command-line-argument \ + -fomit-frame-pointer \ + -std=c99 \ + -pedantic \ + -MMD \ + -O3 \ + -Imlkem_native \ + $(CFLAGS) + +CFLAGS += -DMLK_CONFIG_NAMESPACE_PREFIX=mlkem +CFLAGS += -DMLK_CONFIG_FILE="\"custom_no_randomized_config.h\"" + +BINARY_NAME_FULL_512=$(BUILD_DIR)/$(BIN)512 +BINARY_NAME_FULL_768=$(BUILD_DIR)/$(BIN)768 +BINARY_NAME_FULL_1024=$(BUILD_DIR)/$(BIN)1024 +BINARIES_FULL=$(BINARY_NAME_FULL_512) $(BINARY_NAME_FULL_768) $(BINARY_NAME_FULL_1024) + +$(BINARY_NAME_FULL_512): CFLAGS += -DMLK_CONFIG_PARAMETER_SET=512 +$(BINARY_NAME_FULL_768): CFLAGS += -DMLK_CONFIG_PARAMETER_SET=768 +$(BINARY_NAME_FULL_1024): CFLAGS += -DMLK_CONFIG_PARAMETER_SET=1024 + +$(BINARIES_FULL): $(ALL_SOURCE) + echo "$@" + mkdir -p $(BUILD_DIR) + $(CC) $(CFLAGS) $^ -o $@ + +all: build size + +build: $(BINARIES_FULL) + +run: $(BINARIES_FULL) + $(EXEC_WRAPPER) ./$(BINARY_NAME_FULL_512) + $(EXEC_WRAPPER) ./$(BINARY_NAME_FULL_768) + $(EXEC_WRAPPER) ./$(BINARY_NAME_FULL_1024) + +size: build + @echo "=== Size info for $(BINARY_NAME_FULL_512) ===" + @$(SIZE) $(BINARY_NAME_FULL_512) + @echo "=== Size info for $(BINARY_NAME_FULL_768) ===" + @$(SIZE) $(BINARY_NAME_FULL_768) + @echo "=== Size info for $(BINARY_NAME_FULL_1024) ===" + @$(SIZE) $(BINARY_NAME_FULL_1024) + +clean: + rm -rf $(BUILD_DIR) diff --git a/examples/basic_deterministic/README.md b/examples/basic_deterministic/README.md new file mode 100644 index 000000000..465722080 --- /dev/null +++ 
b/examples/basic_deterministic/README.md @@ -0,0 +1,17 @@ +[//]: # (SPDX-License-Identifier: CC-BY-4.0) + +# Building mlkem-native + +This directory contains a minimal example showing how to build **mlkem-native** for use cases only requiring the deterministic key generation and encapsulation APIs (`crypto_kem_keypair_derand` and `crypto_kem_enc_derand`). In that case, no implementation of `randombytes()` has to be provided. + +## Components + +An application using mlkem-native as-is needs to include the following components: + +1. mlkem-native source tree, including [`mlkem/src/`](../../mlkem/src) and [`mlkem/src/fips202/`](../../mlkem/src/fips202). +2. The application source code + + +## Usage + +Build this example with `make build`, run with `make run`. diff --git a/examples/basic_deterministic/main.c b/examples/basic_deterministic/main.c new file mode 100644 index 000000000..05f23bc8a --- /dev/null +++ b/examples/basic_deterministic/main.c @@ -0,0 +1,109 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include <stdio.h> +#include <string.h> + +/* Import public mlkem-native API + * + * This requires specifying the parameter set and namespace prefix + * used for the build. + */ +#define MLK_CONFIG_API_PARAMETER_SET MLK_CONFIG_PARAMETER_SET +#define MLK_CONFIG_API_NAMESPACE_PREFIX mlkem +#include "mlkem_native/mlkem/mlkem_native.h" + +/* No randombytes needed for deterministic API */ + +#define CHECK(x) \ + do \ + { \ + int rc; \ + rc = (x); \ + if (!rc) \ + { \ + fprintf(stderr, "ERROR (%s,%d)\n", __FILE__, __LINE__); \ + return 1; \ + } \ + } while (0) + +int main(void) +{ + uint8_t pk[CRYPTO_PUBLICKEYBYTES]; + uint8_t sk[CRYPTO_SECRETKEYBYTES]; + uint8_t ct[CRYPTO_CIPHERTEXTBYTES]; + uint8_t key_a[CRYPTO_BYTES]; + uint8_t key_b[CRYPTO_BYTES]; + uint8_t alice_en[2 * MLKEM_SYMBYTES] = {0}; + uint8_t bob_en[MLKEM_SYMBYTES] = {1}; + + + /* The PCT modifies the PRNG state, so the KAT tests don't work.
+ * We run KAT tests only for disabled PCT. + * Expected keys are generated using deterministic entropy: + * keypair uses all-zero entropy {0}, enc uses entropy {1, 0, ..., 0} */ +#if !defined(MLK_CONFIG_KEYGEN_PCT) +#if MLK_CONFIG_PARAMETER_SET == 512 + const uint8_t expected_key[] = { + 0x5f, 0x5f, 0x8c, 0xf5, 0x7c, 0x34, 0xd4, 0x68, 0x06, 0xa2, 0xe9, + 0xc9, 0x28, 0xba, 0x10, 0x5a, 0x46, 0xf2, 0x67, 0x1a, 0xc7, 0x81, + 0xdf, 0xf1, 0x4a, 0xbb, 0x27, 0xea, 0x46, 0x06, 0x46, 0x3c}; +#elif MLK_CONFIG_PARAMETER_SET == 768 + const uint8_t expected_key[] = { + 0x85, 0x21, 0xab, 0xc8, 0x14, 0xc7, 0x67, 0x70, 0x4f, 0xa6, 0x25, + 0xd9, 0x35, 0x95, 0xd0, 0x03, 0x79, 0xa8, 0xb3, 0x70, 0x35, 0x2c, + 0xa4, 0xba, 0xb3, 0xa6, 0x82, 0x46, 0x63, 0x0d, 0xb0, 0x8b}; +#elif MLK_CONFIG_PARAMETER_SET == 1024 + const uint8_t expected_key[] = { + 0x30, 0x4d, 0xbe, 0x54, 0xd6, 0x6f, 0x80, 0x66, 0xc6, 0xa8, 0x1c, + 0x6b, 0x36, 0xc4, 0x48, 0x9b, 0xf9, 0xe6, 0x05, 0x79, 0x83, 0x3c, + 0x4e, 0xdc, 0x8a, 0xc7, 0x92, 0xe5, 0x73, 0x0d, 0xdd, 0x85}; +#endif /* MLK_CONFIG_PARAMETER_SET == 1024 */ +#endif /* !MLK_CONFIG_KEYGEN_PCT */ + + /* No randombytes_reset() needed for deterministic API */ + + printf("Generating keypair ... "); + + /* Alice generates a public key using deterministic API with all-zero entropy + */ + CHECK(crypto_kem_keypair_derand(pk, sk, alice_en) == 0); + + printf("DONE\n"); + printf("Encaps... "); + + /* Bob derives a secret key and creates a response using deterministic API + * with entropy {1, 0, ..., 0} */ + CHECK(crypto_kem_enc_derand(ct, key_b, pk, bob_en) == 0); + + printf("DONE\n"); + printf("Decaps... "); + + /* Alice uses Bob's response to get her shared key */ + CHECK(crypto_kem_dec(key_a, ct, sk) == 0); + + printf("DONE\n"); + printf("Compare... 
"); + + CHECK(memcmp(key_a, key_b, CRYPTO_BYTES) == 0); + + printf("Shared secret: "); + { + size_t i; + for (i = 0; i < sizeof(key_a); i++) + { + printf("%02x", key_a[i]); + } + } + printf("\n"); + + /* Check against the hardcoded result to make sure that + * the deterministic API yields the expected shared secret */ + CHECK(memcmp(key_a, expected_key, CRYPTO_BYTES) == 0); + + + printf("OK\n"); + return 0; +} diff --git a/examples/basic_deterministic/mlkem_native/custom_no_randomized_config.h b/examples/basic_deterministic/mlkem_native/custom_no_randomized_config.h new file mode 100644 index 000000000..f1d4180ae --- /dev/null +++ b/examples/basic_deterministic/mlkem_native/custom_no_randomized_config.h @@ -0,0 +1,531 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [FIPS140_3_IG] + * Implementation Guidance for FIPS 140-3 and the Cryptographic Module + * Validation Program National Institute of Standards and Technology + * https://csrc.nist.gov/projects/cryptographic-module-validation-program/fips-140-3-ig-announcements + */ + +#ifndef MLK_CONFIG_H +#define MLK_CONFIG_H + +/****************************************************************************** + * Name: MLK_CONFIG_PARAMETER_SET + * + * Description: Specifies the parameter set for ML-KEM + * - MLK_CONFIG_PARAMETER_SET=512 corresponds to ML-KEM-512 + * - MLK_CONFIG_PARAMETER_SET=768 corresponds to ML-KEM-768 + * - MLK_CONFIG_PARAMETER_SET=1024 corresponds to ML-KEM-1024 + * + * This can also be set using CFLAGS.
+ * + *****************************************************************************/ +#ifndef MLK_CONFIG_PARAMETER_SET +#define MLK_CONFIG_PARAMETER_SET \ + 768 /* Change this for different security strengths */ +#endif + +/****************************************************************************** + * Name: MLK_CONFIG_FILE + * + * Description: If defined, this is a header that will be included instead + * of this default configuration file mlkem/src/config.h. + * + * When you need to build mlkem-native in multiple configurations, + * using varying MLK_CONFIG_FILE can be more convenient + * than configuring everything through CFLAGS. + * + * To use, MLK_CONFIG_FILE _must_ be defined prior + * to the inclusion of any mlkem-native headers. For example, + * it can be set by passing `-DMLK_CONFIG_FILE="..."` + * on the command line. + * + *****************************************************************************/ +/* #define MLK_CONFIG_FILE "config.h" */ + +/****************************************************************************** + * Name: MLK_CONFIG_NAMESPACE_PREFIX + * + * Description: The prefix to use to namespace global symbols from mlkem/. + * + * In a multi-level build (that is, if either + * - MLK_CONFIG_MULTILEVEL_WITH_SHARED, or + * - MLK_CONFIG_MULTILEVEL_NO_SHARED, + * are set), level-dependent symbols will additionally be prefixed + * with the parameter set (512/768/1024). + * + * This can also be set using CFLAGS. + * + *****************************************************************************/ +#if !defined(MLK_CONFIG_NAMESPACE_PREFIX) +#define MLK_CONFIG_NAMESPACE_PREFIX MLK_DEFAULT_NAMESPACE_PREFIX +#endif + +/****************************************************************************** + * Name: MLK_CONFIG_MULTILEVEL_WITH_SHARED + * + * Description: This is for multi-level builds of mlkem-native only. If you + * need only a single parameter set, keep this unset.
+ * + * If this is set, all MLK_CONFIG_PARAMETER_SET-independent + * code will be included in the build, including code needed only + * for other parameter sets. + * + * Example: mlk_poly_cbd3 is only needed for + * MLK_CONFIG_PARAMETER_SET == 512. Yet, if this option is set + * for a build with MLK_CONFIG_PARAMETER_SET == 768/1024, it + * would be included. + * + * To build mlkem-native with support for all parameter sets, + * build it three times -- once per parameter set -- and set the + * option MLK_CONFIG_MULTILEVEL_WITH_SHARED for exactly one of + * them, and MLK_CONFIG_MULTILEVEL_NO_SHARED for the others. + * + * See examples/multilevel_build for an example. + * + * This can also be set using CFLAGS. + * + *****************************************************************************/ +/* #define MLK_CONFIG_MULTILEVEL_WITH_SHARED */ + +/****************************************************************************** + * Name: MLK_CONFIG_MULTILEVEL_NO_SHARED + * + * Description: This is for multi-level builds of mlkem-native only. If you + * need only a single parameter set, keep this unset. + * + * If this is set, no MLK_CONFIG_PARAMETER_SET-independent code + * will be included in the build. + * + * To build mlkem-native with support for all parameter sets, + * build it three times -- once per parameter set -- and set the + * option MLK_CONFIG_MULTILEVEL_WITH_SHARED for exactly one of + * them, and MLK_CONFIG_MULTILEVEL_NO_SHARED for the others. + * + * See examples/multilevel_build for an example. + * + * This can also be set using CFLAGS. + * + *****************************************************************************/ +/* #define MLK_CONFIG_MULTILEVEL_NO_SHARED */ + +/****************************************************************************** + * Name: MLK_CONFIG_MONOBUILD_KEEP_SHARED_HEADERS + * + * Description: This is only relevant for single compilation unit (SCU) + * builds of mlkem-native. 
In this case, it determines whether + * directives defined in parameter-set-independent headers should + * be #undef'ined or not at the end of the SCU file. This is needed + * in multilevel builds. + * + * See examples/multilevel_build_native for an example. + * + * This can also be set using CFLAGS. + * + *****************************************************************************/ +/* #define MLK_CONFIG_MONOBUILD_KEEP_SHARED_HEADERS */ + +/****************************************************************************** + * Name: MLK_CONFIG_USE_NATIVE_BACKEND_ARITH + * + * Description: Determines whether a native arithmetic backend should be used. + * + * The arithmetic backend covers performance critical functions + * such as the number-theoretic transform (NTT). + * + * If this option is unset, the C backend will be used. + * + * If this option is set, the arithmetic backend to be used is + * determined by MLK_CONFIG_ARITH_BACKEND_FILE: If the latter is + * unset, the default backend for your target architecture + * will be used. If set, it must be the name of a backend metadata + * file. + * + * This can also be set using CFLAGS. + * + *****************************************************************************/ +#if !defined(MLK_CONFIG_USE_NATIVE_BACKEND_ARITH) +/* #define MLK_CONFIG_USE_NATIVE_BACKEND_ARITH */ +#endif + +/****************************************************************************** + * Name: MLK_CONFIG_ARITH_BACKEND_FILE + * + * Description: The arithmetic backend to use. + * + * If MLK_CONFIG_USE_NATIVE_BACKEND_ARITH is unset, this option + * is ignored. + * + * If MLK_CONFIG_USE_NATIVE_BACKEND_ARITH is set, this option must + * either be undefined or the filename of an arithmetic backend. + * If unset, the default backend will be used. + * + * This can be set using CFLAGS.
+ * + *****************************************************************************/ +#if defined(MLK_CONFIG_USE_NATIVE_BACKEND_ARITH) && \ + !defined(MLK_CONFIG_ARITH_BACKEND_FILE) +#define MLK_CONFIG_ARITH_BACKEND_FILE "native/meta.h" +#endif + +/****************************************************************************** + * Name: MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 + * + * Description: Determines whether a native FIPS202 backend should be used. + * + * The FIPS202 backend covers 1x/2x/4x-fold Keccak-f1600, which is + * the performance bottleneck of SHA3 and SHAKE. + * + * If this option is unset, the C backend will be used. + * + * If this option is set, the FIPS202 backend to be used is + * determined by MLK_CONFIG_FIPS202_BACKEND_FILE: If the latter is + * unset, the default backend for the target architecture + * will be used. If set, it must be the name of a backend metadata + * file. + * + * This can also be set using CFLAGS. + * + *****************************************************************************/ +#if !defined(MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202) +/* #define MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 */ +#endif + +/****************************************************************************** + * Name: MLK_CONFIG_FIPS202_BACKEND_FILE + * + * Description: The FIPS-202 backend to use. + * + * If MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 is set, this option + * must either be undefined or the filename of a FIPS202 backend. + * If unset, the default backend will be used. + * + * This can be set using CFLAGS. 
+ * + *****************************************************************************/ +#if defined(MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202) && \ + !defined(MLK_CONFIG_FIPS202_BACKEND_FILE) +#define MLK_CONFIG_FIPS202_BACKEND_FILE "fips202/native/auto.h" +#endif + +/****************************************************************************** + * Name: MLK_CONFIG_FIPS202_CUSTOM_HEADER + * + * Description: Custom header to use for FIPS-202 + * + * This should only be set if you intend to use a custom + * FIPS-202 implementation, different from the one shipped + * with mlkem-native. + * + * If set, it must be the name of a file serving as the + * replacement for mlkem/fips202/fips202.h, and exposing + * the same API (see FIPS202.md). + * + *****************************************************************************/ +/* #define MLK_CONFIG_FIPS202_CUSTOM_HEADER "SOME_FILE.h" */ + +/****************************************************************************** + * Name: MLK_CONFIG_FIPS202X4_CUSTOM_HEADER + * + * Description: Custom header to use for FIPS-202-X4 + * + * This should only be set if you intend to use a custom + * FIPS-202 implementation, different from the one shipped + * with mlkem-native. + * + * If set, it must be the name of a file serving as the + * replacement for mlkem/fips202/fips202x4.h, and exposing + * the same API (see FIPS202.md). + * + *****************************************************************************/ +/* #define MLK_CONFIG_FIPS202X4_CUSTOM_HEADER "SOME_FILE.h" */ + +/****************************************************************************** + * Name: MLK_CONFIG_CUSTOM_ZEROIZE + * + * Description: In compliance with FIPS 203 Section 3.3, mlkem-native zeroizes + * intermediate stack buffers before returning from function calls. + * + * Set this option and define `mlk_zeroize` if you want to + * use a custom method to zeroize intermediate stack buffers. 
+ * The default implementation uses SecureZeroMemory on Windows + * and a memset + compiler barrier otherwise. If neither of those + * is available on the target platform, compilation will fail, + * and you will need to use MLK_CONFIG_CUSTOM_ZEROIZE to provide + * a custom implementation of `mlk_zeroize()`. + * + * WARNING: + * The explicit stack zeroization conducted by mlkem-native + * reduces the likelihood of data leaking on the stack, but + * does not eliminate it! The C standard makes no guarantee about + * where a compiler allocates structures and whether/where it makes + * copies of them. Also, in addition to entire structures, there + * may also be potentially exploitable leakage of individual values + * on the stack. + * + * If you need bullet-proof zeroization of the stack, you need to + * consider additional measures instead of what this feature + * provides. In this case, you can set mlk_zeroize to a no-op. + * + *****************************************************************************/ +/* #define MLK_CONFIG_CUSTOM_ZEROIZE + #if !defined(__ASSEMBLER__) + #include + #include "sys.h" + static MLK_INLINE void mlk_zeroize(void *ptr, size_t len) + { + ... your implementation ... + } + #endif +*/ + +/****************************************************************************** + * Name: MLK_CONFIG_CUSTOM_RANDOMBYTES + * + * Description: mlkem-native does not provide a secure randombytes + * implementation. Such an implementation has to be provided by the + * consumer. + * + * If this option is not set, mlkem-native expects a function + * void randombytes(uint8_t *out, size_t outlen). + * + * Set this option and define `mlk_randombytes` if you want to + * use a custom method to sample randombytes with a different name + * or signature. 
+ * + *****************************************************************************/ +/* #define MLK_CONFIG_CUSTOM_RANDOMBYTES + #if !defined(__ASSEMBLER__) + #include + #include "sys.h" + static MLK_INLINE void mlk_randombytes(uint8_t *ptr, size_t len) + { + ... your implementation ... + } + #endif +*/ + +/****************************************************************************** + * Name: MLK_CONFIG_CUSTOM_CAPABILITY_FUNC + * + * Description: mlkem-native backends may rely on specific hardware features. + * Those backends will only be included in an mlkem-native build + * if support for the respective features is enabled at + * compile-time. However, when building for a heterogeneous set + * of CPUs to run the resulting binary/library on, feature + * detection at _runtime_ is needed to decide whether a backend + * can be used or not. + * + * Set this option and define `mlk_sys_check_capability` if you + * want to use a custom method to dispatch between implementations. + * + * If this option is not set, mlkem-native uses compile-time + * feature detection only to decide which backend to use. + * + * If you compile mlkem-native on a system with different + * capabilities than the system that the resulting binary/library + * will be run on, you must use this option. + * + *****************************************************************************/ +/* #define MLK_CONFIG_CUSTOM_CAPABILITY_FUNC + static MLK_INLINE int mlk_sys_check_capability(mlk_sys_cap cap) + __contract__( + ensures(return_value == 0 || return_value == 1) + ) + { + ... your implementation ... + } +*/ + +/****************************************************************************** + * Name: MLK_CONFIG_CUSTOM_MEMCPY + * + * Description: Set this option and define `mlk_memcpy` if you want to + * use a custom method to copy memory instead of the standard + * library memcpy function. 
+ * + * The custom implementation must have the same signature and + * behavior as the standard memcpy function: + * void *mlk_memcpy(void *dest, const void *src, size_t n) + * + *****************************************************************************/ +/* #define MLK_CONFIG_CUSTOM_MEMCPY + #if !defined(__ASSEMBLER__) + #include + #include "sys.h" + static MLK_INLINE void *mlk_memcpy(void *dest, const void *src, size_t n) + { + ... your implementation ... + } + #endif +*/ + +/****************************************************************************** + * Name: MLK_CONFIG_CUSTOM_MEMSET + * + * Description: Set this option and define `mlk_memset` if you want to + * use a custom method to set memory instead of the standard + * library memset function. + * + * The custom implementation must have the same signature and + * behavior as the standard memset function: + * void *mlk_memset(void *s, int c, size_t n) + * + *****************************************************************************/ +/* #define MLK_CONFIG_CUSTOM_MEMSET + #if !defined(__ASSEMBLER__) + #include + #include "sys.h" + static MLK_INLINE void *mlk_memset(void *s, int c, size_t n) + { + ... your implementation ... + } + #endif +*/ + +/****************************************************************************** + * Name: MLK_CONFIG_INTERNAL_API_QUALIFIER + * + * Description: If set, this option provides an additional function + * qualifier to be added to declarations of internal API. + * + * The primary use case for this option are single-CU builds, + * in which case this option can be set to `static`. 
+ * + *****************************************************************************/ +/* #define MLK_CONFIG_INTERNAL_API_QUALIFIER */ + +/****************************************************************************** + * Name: MLK_CONFIG_EXTERNAL_API_QUALIFIER + * + * Description: If set, this option provides an additional function + * qualifier to be added to declarations of mlkem-native's + * public API. + * + * The primary use case for this option are single-CU builds + * where the public API exposed by mlkem-native is wrapped by + * another API in the consuming application. In this case, + * even mlkem-native's public API can be marked `static`. + * + *****************************************************************************/ +/* #define MLK_CONFIG_EXTERNAL_API_QUALIFIER */ + +/****************************************************************************** + * Name: MLK_CONFIG_CT_TESTING_ENABLED + * + * Description: If set, mlkem-native annotates data as secret / public using + * valgrind's annotations VALGRIND_MAKE_MEM_UNDEFINED and + * VALGRIND_MAKE_MEM_DEFINED, enabling various checks for secret- + * dependent control flow of variable time execution (depending + * on the exact version of valgrind installed). + * + *****************************************************************************/ +/* #define MLK_CONFIG_CT_TESTING_ENABLED */ + +/****************************************************************************** + * Name: MLK_CONFIG_NO_ASM + * + * Description: If this option is set, mlkem-native will be built without + * use of native code or inline assembly. + * + * By default, inline assembly is used to implement value barriers. + * Without inline assembly, mlkem-native will use a global volatile + * 'opt blocker' instead; see verify.h. + * + * Inline assembly is also used to implement a secure zeroization + * function on non-Windows platforms. 
If this option is set and + * the target platform is not Windows, you MUST set + * MLK_CONFIG_CUSTOM_ZEROIZE and provide a custom zeroization + * function. + * + * If this option is set, MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 and + * MLK_CONFIG_USE_NATIVE_BACKEND_ARITH will be ignored, and no + * native backends will be used. + * + *****************************************************************************/ +/* #define MLK_CONFIG_NO_ASM */ + +/****************************************************************************** + * Name: MLK_CONFIG_NO_RANDOMIZED_API + * + * Description: If this option is set, mlkem-native will be built without the + * randomized API functions (crypto_kem_keypair and + * crypto_kem_enc). + * This allows users to build mlkem-native without providing a + * randombytes() implementation if they only need the + * deterministic API + * (crypto_kem_keypair_derand, crypto_kem_enc_derand, + * crypto_kem_dec). + * + * NOTE: This option is incompatible with MLK_CONFIG_KEYGEN_PCT + * as the current PCT implementation requires crypto_kem_enc(). + * + *****************************************************************************/ +#define MLK_CONFIG_NO_RANDOMIZED_API + +/****************************************************************************** + * Name: MLK_CONFIG_KEYGEN_PCT + * + * Description: Compliance with @[FIPS140_3_IG, p.87] requires a + * Pairwise Consistency Test (PCT) to be carried out on a freshly + * generated keypair before it can be exported. + * + * Set this option if such a check should be implemented. + * In this case, crypto_kem_keypair_derand and crypto_kem_keypair + * will return a non-zero error code if the PCT failed. + * + * NOTE: This feature will drastically lower the performance of + * key generation. 
+ * + *****************************************************************************/ +/* #define MLK_CONFIG_KEYGEN_PCT */ + +/****************************************************************************** + * Name: MLK_CONFIG_KEYGEN_PCT_BREAKAGE_TEST + * + * Description: If this option is set, the user must provide a runtime + * function `static inline int mlk_break_pct() { ... }` to + * indicate whether the PCT should be made to fail. + * + * This option only has an effect if MLK_CONFIG_KEYGEN_PCT is set. + * + *****************************************************************************/ +/* #define MLK_CONFIG_KEYGEN_PCT_BREAKAGE_TEST + #if !defined(__ASSEMBLER__) + #include "sys.h" + static MLK_INLINE int mlk_break_pct(void) + { + ... return 0/1 depending on whether PCT should be broken ... + } + #endif +*/ + +/************************* Config internals ********************************/ + +/* Default namespace + * + * Don't change this. If you need a different namespace, re-define + * MLK_CONFIG_NAMESPACE_PREFIX above instead, and remove the following. 
+ * + * The default MLKEM namespace is + * + * PQCP_MLKEM_NATIVE_MLKEM_ + * + * e.g., PQCP_MLKEM_NATIVE_MLKEM512_ + */ + +#if MLK_CONFIG_PARAMETER_SET == 512 +#define MLK_DEFAULT_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_MLKEM512 +#elif MLK_CONFIG_PARAMETER_SET == 768 +#define MLK_DEFAULT_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_MLKEM768 +#elif MLK_CONFIG_PARAMETER_SET == 1024 +#define MLK_DEFAULT_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_MLKEM1024 +#endif + +#endif /* !MLK_CONFIG_H */ diff --git a/examples/basic_deterministic/mlkem_native/mlkem b/examples/basic_deterministic/mlkem_native/mlkem new file mode 120000 index 000000000..f4ec7bdb2 --- /dev/null +++ b/examples/basic_deterministic/mlkem_native/mlkem @@ -0,0 +1 @@ +../../../mlkem \ No newline at end of file diff --git a/scripts/tests b/scripts/tests index 40b35da7f..37b85cc17 100755 --- a/scripts/tests +++ b/scripts/tests @@ -208,6 +208,7 @@ class TEST_TYPES(Enum): MONOLITHIC_BUILD_NATIVE = 14 STACK = 15 SIZE = 16 + BASIC_DETERMINISTIC = 17 def is_benchmark(self): return self in [TEST_TYPES.BENCH, TEST_TYPES.BENCH_COMPONENTS] @@ -227,6 +228,7 @@ class TEST_TYPES(Enum): TEST_TYPES.MONOLITHIC_BUILD_MULTILEVEL_NATIVE, TEST_TYPES.MULTILEVEL_BUILD, TEST_TYPES.MULTILEVEL_BUILD_NATIVE, + TEST_TYPES.BASIC_DETERMINISTIC, ] @staticmethod @@ -260,6 +262,8 @@ class TEST_TYPES(Enum): return "Example (Custom Backend)" if self == TEST_TYPES.BASIC: return "Example (mlkem-native as code package)" + if self == TEST_TYPES.BASIC_DETERMINISTIC: + return "Example (mlkem-native as code package without randombytes() implementation)" if self == TEST_TYPES.MONOLITHIC_BUILD: return "Example (monobuild)" if self == TEST_TYPES.MONOLITHIC_BUILD_NATIVE: @@ -282,6 +286,8 @@ class TEST_TYPES(Enum): return "examples/custom_backend" if self == TEST_TYPES.BASIC: return "examples/basic" + if self == TEST_TYPES.BASIC_DETERMINISTIC: + return "examples/basic_deterministic" if self == TEST_TYPES.MONOLITHIC_BUILD: return "examples/monolithic_build" if self == 
TEST_TYPES.MONOLITHIC_BUILD_NATIVE: @@ -315,6 +321,8 @@ class TEST_TYPES(Enum): return "" if self == TEST_TYPES.BASIC: return "" + if self == TEST_TYPES.BASIC_DETERMINISTIC: + return "" if self == TEST_TYPES.MONOLITHIC_BUILD: return "" if self == TEST_TYPES.MONOLITHIC_BUILD_NATIVE: @@ -1104,6 +1112,7 @@ def cli(): "bring_your_own_fips202", "custom_backend", "basic", + "basic_deterministic", "monolithic_build", "monolithic_build_native", "monolithic_build_multilevel", From 95d003e6500f33bc09ccba24130831cf23d4950b Mon Sep 17 00:00:00 2001 From: willieyz Date: Thu, 11 Sep 2025 13:33:26 +0800 Subject: [PATCH 11/16] CBMC: Increase the CBMC_OBJECT_BITS of `matvec_mul` to 12 - During latest change about adding derandomized config guard, the CBMC proof for `matvec_mul` failed due to SMT-solver return unknown, increadse the CBMC_OBJECT_BITS to fixed it. Signed-off-by: willieyz Signed-off-by: Danny Tsen --- proofs/cbmc/matvec_mul/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/proofs/cbmc/matvec_mul/Makefile b/proofs/cbmc/matvec_mul/Makefile index 65223be1c..65fc7e878 100644 --- a/proofs/cbmc/matvec_mul/Makefile +++ b/proofs/cbmc/matvec_mul/Makefile @@ -47,7 +47,7 @@ FUNCTION_NAME = mlk_matvec_mul # EXPENSIVE = true # This function is large enough to need... -CBMC_OBJECT_BITS = 10 +CBMC_OBJECT_BITS = 12 # If you require access to a file-local ("static") function or object to conduct # your proof, set the following (and do not include the original source file From 111fbd9592c3e4a7c974544ff96f3e582ddfa788 Mon Sep 17 00:00:00 2001 From: willieyz Date: Thu, 11 Sep 2025 11:03:19 +0800 Subject: [PATCH 12/16] Add `--exclude example` args for "PCT enabled" CI testing - Adds an option in the tests script to exclude specific examples. - Needed because basic_deterministic is incompatible with MLK_CONFIG_KEYGEN_PCT. - Allows CI to run all examples while skipping incompatible ones. 
Signed-off-by: willieyz Signed-off-by: Danny Tsen --- .github/actions/config-variations/action.yml | 2 ++ scripts/tests | 25 ++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/.github/actions/config-variations/action.yml b/.github/actions/config-variations/action.yml index c824d5d5f..15e0689f9 100644 --- a/.github/actions/config-variations/action.yml +++ b/.github/actions/config-variations/action.yml @@ -28,6 +28,8 @@ runs: kat: true acvp: true opt: ${{ inputs.opt }} + examples: true + extra_args: "--exclude-example basic_deterministic" - name: "PCT enabled + broken" if: ${{ inputs.tests == 'all' || contains(inputs.tests, 'pct-enabled-broken') }} shell: bash diff --git a/scripts/tests b/scripts/tests index 37b85cc17..c1fa2e5ba 100755 --- a/scripts/tests +++ b/scripts/tests @@ -636,6 +636,12 @@ class Tests: l = TEST_TYPES.examples() else: l = list(map(TEST_TYPES.from_string, self.args.l)) + + # Filter out excluded examples + if hasattr(self.args, "exclude_example") and self.args.exclude_example: + excluded = [TEST_TYPES.from_string(ex) for ex in self.args.exclude_example] + l = [e for e in l if e not in excluded] + for e in l: self._compile_schemes(e, None) self._run_scheme(e, None, None) @@ -1075,6 +1081,25 @@ def cli(): help="Do not run examples", ) + all_parser.add_argument( + "--exclude-example", + help="Exclude specific examples from running (can be used multiple times)", + choices=[ + "bring_your_own_fips202", + "custom_backend", + "basic", + "basic_deterministic", + "monolithic_build", + "monolithic_build_native", + "monolithic_build_multilevel", + "monolithic_build_multilevel_native", + "multilevel_build", + "multilevel_build_native", + ], + action="append", + default=[], + ) + stack_group = all_parser.add_mutually_exclusive_group() stack_group.add_argument( "--stack", From f768bb5394667fdd8c37a0befbbec53cfb8a06cd Mon Sep 17 00:00:00 2001 From: Rod Chapman Date: Tue, 16 Sep 2025 20:57:17 +0100 Subject: [PATCH 13/16] Introduce explicit upper 
bounds on lengths of input and output buffers where appropriate. Force CBMC to use --malloc-fail-assert for all proofs to remove assumption on buffer lengths. Update autogenerated files following this change. Update Proof Guide with notes on max buffer size Signed-off-by: Rod Chapman Signed-off-by: Danny Tsen --- mlkem/src/cbmc.h | 14 +++++++++++++- mlkem/src/fips202/fips202.c | 2 ++ mlkem/src/fips202/fips202.h | 5 +++++ mlkem/src/fips202/fips202x4.c | 4 +++- mlkem/src/fips202/fips202x4.h | 4 +++- mlkem/src/verify.h | 6 ++++-- proofs/cbmc/Makefile.common | 2 +- proofs/cbmc/proof_guide.md | 35 +++++++++++++++++++++++++++++++++++ 8 files changed, 66 insertions(+), 6 deletions(-) diff --git a/mlkem/src/cbmc.h b/mlkem/src/cbmc.h index da1f7f2d9..7cbd78036 100644 --- a/mlkem/src/cbmc.h +++ b/mlkem/src/cbmc.h @@ -8,7 +8,6 @@ /*************************************************** * Basic replacements for __CPROVER_XXX contracts ***************************************************/ - #ifndef CBMC #define __contract__(x) @@ -16,6 +15,8 @@ #else /* !CBMC */ +#include + #define __contract__(x) x #define __loop__(x) x @@ -59,6 +60,17 @@ #define readable(...) __CPROVER_r_ok(__VA_ARGS__) #define writeable(...) __CPROVER_w_ok(__VA_ARGS__) +/* Maximum supported buffer size + * + * Larger buffers may be supported, but due to internal modeling constraints + * in CBMC, the proofs of memory- and type-safety won't be able to run. + * + * If you find yourself in need for a buffer size larger than this, + * please contact the maintainers, so we can prioritize work to relax + * this somewhat artificial bound. 
+ */ +#define MLK_MAX_BUFFER_SIZE (SIZE_MAX >> 12) + /* * History variables * https://diffblue.github.io/cbmc/contracts-history-variables.html diff --git a/mlkem/src/fips202/fips202.c b/mlkem/src/fips202/fips202.c index 811357439..06eda6f70 100644 --- a/mlkem/src/fips202/fips202.c +++ b/mlkem/src/fips202/fips202.c @@ -60,6 +60,7 @@ static void mlk_keccak_absorb_once(uint64_t *s, uint32_t r, const uint8_t *m, size_t mlen, uint8_t p) __contract__( + requires(mlen <= MLK_MAX_BUFFER_SIZE) requires(r <= sizeof(uint64_t) * MLK_KECCAK_LANES) requires(memory_no_alias(s, sizeof(uint64_t) * MLK_KECCAK_LANES)) requires(memory_no_alias(m, mlen)) @@ -153,6 +154,7 @@ __contract__( static void mlk_keccak_squeeze_once(uint8_t *h, size_t outlen, uint64_t *s, uint32_t r) __contract__( + requires(outlen <= MLK_MAX_BUFFER_SIZE) requires(r <= sizeof(uint64_t) * MLK_KECCAK_LANES) requires(memory_no_alias(s, sizeof(uint64_t) * MLK_KECCAK_LANES)) requires(memory_no_alias(h, outlen)) diff --git a/mlkem/src/fips202/fips202.h b/mlkem/src/fips202/fips202.h index fe27b341f..2334718e7 100644 --- a/mlkem/src/fips202/fips202.h +++ b/mlkem/src/fips202/fips202.h @@ -47,6 +47,7 @@ typedef struct void mlk_shake128_absorb_once(mlk_shake128ctx *state, const uint8_t *input, size_t inlen) __contract__( + requires(inlen <= MLK_MAX_BUFFER_SIZE) requires(memory_no_alias(state, sizeof(mlk_shake128ctx))) requires(memory_no_alias(input, inlen)) assigns(memory_slice(state, sizeof(mlk_shake128ctx))) @@ -96,6 +97,8 @@ void mlk_shake128_release(mlk_shake128ctx *state); void mlk_shake256(uint8_t *output, size_t outlen, const uint8_t *input, size_t inlen) __contract__( + requires(inlen <= MLK_MAX_BUFFER_SIZE) + requires(outlen <= MLK_MAX_BUFFER_SIZE) requires(memory_no_alias(input, inlen)) requires(memory_no_alias(output, outlen)) assigns(memory_slice(output, outlen)) @@ -116,6 +119,7 @@ __contract__( **************************************************/ void mlk_sha3_256(uint8_t *output, const uint8_t *input, size_t 
inlen) __contract__( + requires(inlen <= MLK_MAX_BUFFER_SIZE) requires(memory_no_alias(input, inlen)) requires(memory_no_alias(output, SHA3_256_HASHBYTES)) assigns(memory_slice(output, SHA3_256_HASHBYTES)) @@ -136,6 +140,7 @@ __contract__( **************************************************/ void mlk_sha3_512(uint8_t *output, const uint8_t *input, size_t inlen) __contract__( + requires(inlen <= MLK_MAX_BUFFER_SIZE) requires(memory_no_alias(input, inlen)) requires(memory_no_alias(output, SHA3_512_HASHBYTES)) assigns(memory_slice(output, SHA3_512_HASHBYTES)) diff --git a/mlkem/src/fips202/fips202x4.c b/mlkem/src/fips202/fips202x4.c index 34cfd0aa4..5608a2b8a 100644 --- a/mlkem/src/fips202/fips202x4.c +++ b/mlkem/src/fips202/fips202x4.c @@ -28,6 +28,7 @@ static void mlk_keccak_absorb_once_x4(uint64_t *s, uint32_t r, const uint8_t *in2, const uint8_t *in3, size_t inlen, uint8_t p) __contract__( + requires(inlen <= MLK_MAX_BUFFER_SIZE) requires(memory_no_alias(s, sizeof(uint64_t) * MLK_KECCAK_LANES * MLK_KECCAK_WAY)) requires(r <= sizeof(uint64_t) * MLK_KECCAK_LANES) requires(memory_no_alias(in0, inlen)) @@ -78,7 +79,8 @@ static void mlk_keccak_squeezeblocks_x4(uint8_t *out0, uint8_t *out1, size_t nblocks, uint64_t *s, uint32_t r) __contract__( requires(r <= sizeof(uint64_t) * MLK_KECCAK_LANES) - requires(nblocks <= 8 /* somewhat arbitrary bound */) + requires(r == SHAKE128_RATE || r == SHAKE256_RATE) + requires(nblocks <= (MLK_MAX_BUFFER_SIZE / SHAKE256_RATE)) requires(memory_no_alias(s, sizeof(uint64_t) * MLK_KECCAK_LANES * MLK_KECCAK_WAY)) requires(memory_no_alias(out0, nblocks * r)) requires(memory_no_alias(out1, nblocks * r)) diff --git a/mlkem/src/fips202/fips202x4.h b/mlkem/src/fips202/fips202x4.h index 76741d2e3..d4f285e23 100644 --- a/mlkem/src/fips202/fips202x4.h +++ b/mlkem/src/fips202/fips202x4.h @@ -25,6 +25,7 @@ void mlk_shake128x4_absorb_once(mlk_shake128x4ctx *state, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t 
inlen) __contract__( + requires(inlen <= MLK_MAX_BUFFER_SIZE) requires(memory_no_alias(state, sizeof(mlk_shake128x4ctx))) requires(memory_no_alias(in0, inlen)) requires(memory_no_alias(in1, inlen)) @@ -62,7 +63,8 @@ void mlk_shake256x4(uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3, size_t outlen, uint8_t *in0, uint8_t *in1, uint8_t *in2, uint8_t *in3, size_t inlen) __contract__( - requires(outlen <= 8 * SHAKE256_RATE /* somewhat arbitrary bound */) + requires(inlen <= MLK_MAX_BUFFER_SIZE) + requires(outlen <= MLK_MAX_BUFFER_SIZE) requires(memory_no_alias(in0, inlen)) requires(memory_no_alias(in1, inlen)) requires(memory_no_alias(in2, inlen)) diff --git a/mlkem/src/verify.h b/mlkem/src/verify.h index 89eac7678..c51495248 100644 --- a/mlkem/src/verify.h +++ b/mlkem/src/verify.h @@ -318,7 +318,8 @@ __contract__(ensures(return_value == (cond ? a : b))) * * Arguments: const uint8_t *a: pointer to first byte array * const uint8_t *b: pointer to second byte array - * size_t len: length of the byte arrays + * size_t len: length of the byte arrays, upper-bounded + * to INT_MAX to control proof complexity * * Returns 0 if the byte arrays are equal, a non-zero value otherwise * @@ -338,9 +339,9 @@ __contract__(ensures(return_value == (cond ? 
a : b))) static MLK_INLINE uint8_t mlk_ct_memcmp(const uint8_t *a, const uint8_t *b, const size_t len) __contract__( + requires(len <= INT_MAX) requires(memory_no_alias(a, len)) requires(memory_no_alias(b, len)) - requires(len <= INT_MAX) ensures((return_value == 0) == forall(i, 0, len, (a[i] == b[i])))) { uint8_t r = 0, s = 0; @@ -391,6 +392,7 @@ __contract__( static MLK_INLINE void mlk_ct_cmov_zero(uint8_t *r, const uint8_t *x, size_t len, uint8_t b) __contract__( + requires(len <= MLK_MAX_BUFFER_SIZE) requires(memory_no_alias(r, len)) requires(memory_no_alias(x, len)) assigns(memory_slice(r, len))) diff --git a/proofs/cbmc/Makefile.common b/proofs/cbmc/Makefile.common index 0bfb3f0b5..cf9d61aed 100644 --- a/proofs/cbmc/Makefile.common +++ b/proofs/cbmc/Makefile.common @@ -246,7 +246,7 @@ endif # * an entire project when added to Makefile-project-defines # * a specific proof when added to the harness Makefile -CBMC_FLAG_MALLOC_MAY_FAIL ?= # set to --no-malloc-may-fail to disable +CBMC_FLAG_MALLOC_MAY_FAIL ?= --malloc-fail-assert CBMC_FLAG_BOUNDS_CHECK ?= # set to --no-bounds-check to disable CBMC_FLAG_CONVERSION_CHECK ?= --conversion-check CBMC_FLAG_DIV_BY_ZERO_CHECK ?= # set to --no-div-by-zero-check to disable diff --git a/proofs/cbmc/proof_guide.md b/proofs/cbmc/proof_guide.md index 4253fecae..e61a4b38c 100644 --- a/proofs/cbmc/proof_guide.md +++ b/proofs/cbmc/proof_guide.md @@ -73,6 +73,41 @@ for some struct `foo`, you cannot pass `&foo[0]`, `&foo[1]` as arguments to a fu `memory_no_alias(...)` for both, because `&foo[0]`, `&foo[1]` point to the same object. In mlkem-native, we sometimes work around this by manually splitting statically-sized arrays into multiple separate objects. 
+ +### Maximum buffer sizes + +CBMC assumes that allocated objects are less than `__CPROVER_max_malloc_size` +which is an internal constant defined to be `SIZE_MAX >> (OBJECT_BITS + 1)` +for that particular run of CBMC, where `SIZE_MAX` is an implementation-defined +constant (declared in `stdint.h`) and `OBJECT_BITS` is a command-line parameter +with value typically in the range 8 .. 12. + +See the [memory bounds checking](https://diffblue.github.io/cbmc/memory-bounds-checking.html) +section of the CBMC manual for more details. + +Pragmatically, `SIZE_MAX` will either be `2**64-1` or `2**32-1` depending on the +host platform, and we choose the largest value of `OBJECT_BITS` that is used +for all proofs in this repository. + +This matters where a function takes a formal parameter `p` of some pointer type +`t` and a `len` parameter of type `size_t` that denotes the number of elements +pointed to by `p`, and those parameters are subject to a +`memory_no_alias(p, len * sizeof(t))` contract. + +In such cases, len must be explicitly bounded to be less than or equal to +MLK_MAX_BUFFER_SIZE which might be defined in `cbmc.h` as: +```c +#define MLK_MAX_BUFFER_SIZE (SIZE_MAX >> 12) +``` +and used, for example, as follows: +```c +void f(t *p, size_t len) +__contract__( + requires(len * sizeof(t) <= MLK_MAX_BUFFER_SIZE) + requires(memory_no_alias(p, len * sizeof(t))) +); +``` + ### Memory footprint The most common way to specify memory footprint in `assigns(...)` clauses is via `memory_slice(ptr, len)`. This asserts From 52b9c78bbc6c4571c4d3580eb624cc0a20376c9c Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Mon, 22 Sep 2025 03:17:14 -0400 Subject: [PATCH 14/16] Removed unused comment lines. 
Signed-off-by: Danny Tsen --- dev/ppc64le/src/intt_ppc.S | 7 ------- dev/ppc64le/src/ntt_ppc.S | 5 ----- mlkem/src/native/ppc64le/src/intt_ppc.S | 7 ------- mlkem/src/native/ppc64le/src/ntt_ppc.S | 5 ----- 4 files changed, 24 deletions(-) diff --git a/dev/ppc64le/src/intt_ppc.S b/dev/ppc64le/src/intt_ppc.S index 1f4b48e42..b3ffe2f31 100644 --- a/dev/ppc64le/src/intt_ppc.S +++ b/dev/ppc64le/src/intt_ppc.S @@ -303,7 +303,6 @@ MLK_ASM_FN_SYMBOL(intt_ppc) xxlor 7, 32+9, 32+9 # V_25 syore at vs7 .align 4 -#__Len2: # # 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 # Update zetas vectors, each vector has 2 zetas @@ -340,7 +339,6 @@ intt_ppc__Loop2: bdnz intt_ppc__Loop2 .align 4 -#__Len4: # # 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 addi 14, 4, IZETA_NTT_OFFSET63 @@ -375,7 +373,6 @@ intt_ppc__Loop4: bdnz intt_ppc__Loop4 .align 4 -#__Len8: # 3. len = 8, start = 0, 16, 32, 48,...208, 224, 240 #addi 14, 14, 512 li 7, 16 @@ -417,7 +414,6 @@ intt_ppc__Loop4: Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 -#__Len16: # # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 #addi 14, 14, 768 @@ -461,7 +457,6 @@ intt_ppc__Loop4: Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 -#__Len32: # # 5. len = 32, start = 0, 64, 128, 192 #addi 14, 14, 896 @@ -508,7 +503,6 @@ intt_ppc__Loop4: Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 -#__Len64: # # 6. len = 64, start = 0, 128 #addi 14, 14, 960 @@ -552,7 +546,6 @@ intt_ppc__Loop4: Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 -#__Len128: # 7. len = 128, start = 0 # #addi 14, 14, 992 diff --git a/dev/ppc64le/src/ntt_ppc.S b/dev/ppc64le/src/ntt_ppc.S index 5bc1c34b8..0c98581c5 100644 --- a/dev/ppc64le/src/ntt_ppc.S +++ b/dev/ppc64le/src/ntt_ppc.S @@ -196,7 +196,6 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) lxv 32+V_QINV, QINV_OFFSET(4) .align 4 -#__Len128: # # Compute coefficients of the NTT based on the following loop. # for (len = 128; len ≥ 2; len = len/2) @@ -221,7 +220,6 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) Write_One .align 4 -#__Len64: # # 2. 
len = 64, start = 0, 128 # k += 2 @@ -245,7 +243,6 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) Write_One .align 4 -#__Len32: # # 3. len = 32, start = 0, 64, 128, 192 # k += 4 @@ -278,7 +275,6 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) Write_One .align 4 -#__Len16: # # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 # k += 8 @@ -300,7 +296,6 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) Write_One .align 4 -#__Len8: # # 5. len = 8, start = 0, 16, 32, 48,...208, 224, 240 # k += 16 diff --git a/mlkem/src/native/ppc64le/src/intt_ppc.S b/mlkem/src/native/ppc64le/src/intt_ppc.S index 1a4975ba0..163c3f806 100644 --- a/mlkem/src/native/ppc64le/src/intt_ppc.S +++ b/mlkem/src/native/ppc64le/src/intt_ppc.S @@ -302,7 +302,6 @@ MLK_ASM_FN_SYMBOL(intt_ppc) xxlor 7, 32+9, 32+9 # V_25 syore at vs7 .align 4 -#__Len2: # # 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 # Update zetas vectors, each vector has 2 zetas @@ -339,7 +338,6 @@ intt_ppc__Loop2: bdnz intt_ppc__Loop2 .align 4 -#__Len4: # # 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 addi 14, 4, IZETA_NTT_OFFSET63 @@ -374,7 +372,6 @@ intt_ppc__Loop4: bdnz intt_ppc__Loop4 .align 4 -#__Len8: # 3. len = 8, start = 0, 16, 32, 48,...208, 224, 240 #addi 14, 14, 512 li 7, 16 @@ -416,7 +413,6 @@ intt_ppc__Loop4: Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 -#__Len16: # # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 #addi 14, 14, 768 @@ -460,7 +456,6 @@ intt_ppc__Loop4: Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 -#__Len32: # # 5. len = 32, start = 0, 64, 128, 192 #addi 14, 14, 896 @@ -507,7 +502,6 @@ intt_ppc__Loop4: Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 -#__Len64: # # 6. len = 64, start = 0, 128 #addi 14, 14, 960 @@ -551,7 +545,6 @@ intt_ppc__Loop4: Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 -#__Len128: # 7. 
len = 128, start = 0 # #addi 14, 14, 992 diff --git a/mlkem/src/native/ppc64le/src/ntt_ppc.S b/mlkem/src/native/ppc64le/src/ntt_ppc.S index e9a8df81f..83f42f9b8 100644 --- a/mlkem/src/native/ppc64le/src/ntt_ppc.S +++ b/mlkem/src/native/ppc64le/src/ntt_ppc.S @@ -195,7 +195,6 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) lxv 32+V_QINV, QINV_OFFSET(4) .align 4 -#__Len128: # # Compute coefficients of the NTT based on the following loop. # for (len = 128; len ≥ 2; len = len/2) @@ -220,7 +219,6 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) Write_One .align 4 -#__Len64: # # 2. len = 64, start = 0, 128 # k += 2 @@ -244,7 +242,6 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) Write_One .align 4 -#__Len32: # # 3. len = 32, start = 0, 64, 128, 192 # k += 4 @@ -277,7 +274,6 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) Write_One .align 4 -#__Len16: # # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 # k += 8 @@ -299,7 +295,6 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) Write_One .align 4 -#__Len8: # # 5. len = 8, start = 0, 16, 32, 48,...208, 224, 240 # k += 16 From 61abb93fe5b2ab573691c60b75e078617b0268b2 Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Mon, 22 Sep 2025 07:00:04 -0400 Subject: [PATCH 15/16] Removed un-wanted comment. Removed non-p8 instruction, xxspltib. 
Signed-off-by: Danny Tsen --- dev/ppc64le/src/intt_ppc.S | 1 - dev/ppc64le/src/reduce.S | 2 +- mlkem/src/native/ppc64le/src/intt_ppc.S | 1 - mlkem/src/native/ppc64le/src/reduce.S | 2 +- 4 files changed, 2 insertions(+), 4 deletions(-) diff --git a/dev/ppc64le/src/intt_ppc.S b/dev/ppc64le/src/intt_ppc.S index b3ffe2f31..95bf370b8 100644 --- a/dev/ppc64le/src/intt_ppc.S +++ b/dev/ppc64le/src/intt_ppc.S @@ -292,7 +292,6 @@ MLK_ASM_FN_SYMBOL(intt_ppc) lxv 32+V20159, C20159_OFFSET(4) # V20159 lxv 7, 0(4) # V_25 - #xxspltiw 8, 26 # for power9 and above vspltisw 8, 13 vadduwm 8, 8, 8 xxlor 8, 32+8, 32+8 # V_26 store at vs8 diff --git a/dev/ppc64le/src/reduce.S b/dev/ppc64le/src/reduce.S index dfb634392..603e0d38b 100644 --- a/dev/ppc64le/src/reduce.S +++ b/dev/ppc64le/src/reduce.S @@ -186,7 +186,7 @@ MLK_ASM_FN_SYMBOL(reduce_ppc) # .align 4 addi 3, 3, -512 - xxspltib 32+9 ,0 + vxor 9, 9, 9 vspltish 10, 15 vmr 11, V_MKQ diff --git a/mlkem/src/native/ppc64le/src/intt_ppc.S b/mlkem/src/native/ppc64le/src/intt_ppc.S index 163c3f806..817c8c299 100644 --- a/mlkem/src/native/ppc64le/src/intt_ppc.S +++ b/mlkem/src/native/ppc64le/src/intt_ppc.S @@ -291,7 +291,6 @@ MLK_ASM_FN_SYMBOL(intt_ppc) lxv 32+V20159, C20159_OFFSET(4) # V20159 lxv 7, 0(4) # V_25 - #xxspltiw 8, 26 # for power9 and above vspltisw 8, 13 vadduwm 8, 8, 8 xxlor 8, 32+8, 32+8 # V_26 store at vs8 diff --git a/mlkem/src/native/ppc64le/src/reduce.S b/mlkem/src/native/ppc64le/src/reduce.S index 558410955..f9681c456 100644 --- a/mlkem/src/native/ppc64le/src/reduce.S +++ b/mlkem/src/native/ppc64le/src/reduce.S @@ -185,7 +185,7 @@ MLK_ASM_FN_SYMBOL(reduce_ppc) # .align 4 addi 3, 3, -512 - xxspltib 32+9 ,0 + vxor 9, 9, 9 vspltish 10, 15 vmr 11, V_MKQ From 33858ecc8820c61f69a4ffa6a395df5cc22da2fb Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Wed, 8 Oct 2025 01:34:49 -0400 Subject: [PATCH 16/16] Fixed failure in INTT unit tests. Re-arranged zeta array for NTT/INTT for Len 2 and 4. 
Signed-off-by: Danny Tsen --- dev/ppc64le/src/consts.c | 191 ++++++------ dev/ppc64le/src/consts.h | 4 +- dev/ppc64le/src/intt_ppc.S | 371 ++++++++++++----------- dev/ppc64le/src/ntt_ppc.S | 225 +++++++++----- mlkem/src/native/ppc64le/src/consts.c | 191 ++++++------ mlkem/src/native/ppc64le/src/consts.h | 4 +- mlkem/src/native/ppc64le/src/intt_ppc.S | 386 ++++++++++++------------ mlkem/src/native/ppc64le/src/ntt_ppc.S | 231 ++++++++------ 8 files changed, 836 insertions(+), 767 deletions(-) diff --git a/dev/ppc64le/src/consts.c b/dev/ppc64le/src/consts.c index 4c2fbdf61..fa0f7097f 100644 --- a/dev/ppc64le/src/consts.c +++ b/dev/ppc64le/src/consts.c @@ -3,6 +3,11 @@ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ +#include +#include +#include +#include + #include "../../../common.h" #if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ @@ -10,7 +15,7 @@ #include "consts.h" -MLK_ALIGN const int16_t mlk_ppc_qdata[1568] = { +MLK_ALIGN const int16_t mlk_ppc_qdata[1072] = { /* -Q */ -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329, /* QINV */ @@ -44,112 +49,84 @@ MLK_ALIGN const int16_t mlk_ppc_qdata[1568] = { 732, 732, 732, 732, 732, 732, 608, 608, 608, 608, 608, 608, 608, 608, -1542, -1542, -1542, -1542, -1542, -1542, -1542, -1542, 411, 411, 411, 411, 411, 411, 411, 411, -205, -205, -205, -205, -205, -205, -205, -205, -1571, -1571, - -1571, -1571, -1571, -1571, -1571, -1571, 1223, 1223, 1223, 1223, 1223, - 1223, 1223, 1223, 652, 652, 652, 652, 652, 652, 652, 652, -552, -552, -552, - -552, -552, -552, -552, -552, 1015, 1015, 1015, 1015, 1015, 1015, 1015, - 1015, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1491, 1491, - 1491, 1491, 1491, 1491, 1491, 1491, -282, -282, -282, -282, -282, -282, - -282, -282, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, 516, - 516, 516, 516, 516, 516, 516, 516, -8, -8, -8, -8, -8, -8, -8, -8, -320, - -320, -320, -320, -320, -320, -320, -320, -666, -666, -666, -666, -666, - -666, -666, -666, -1618, -1618, 
-1618, -1618, -1618, -1618, -1618, -1618, - -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, 126, 126, 126, 126, - 126, 126, 126, 126, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, -853, - -853, -853, -853, -853, -853, -853, -853, -90, -90, -90, -90, -90, -90, -90, - -90, -271, -271, -271, -271, -271, -271, -271, -271, 830, 830, 830, 830, - 830, 830, 830, 830, 107, 107, 107, 107, 107, 107, 107, 107, -1421, -1421, - -1421, -1421, -1421, -1421, -1421, -1421, -247, -247, -247, -247, -247, - -247, -247, -247, -951, -951, -951, -951, -951, -951, -951, -951, -398, - -398, -398, -398, -398, -398, -398, -398, 961, 961, 961, 961, 961, 961, 961, - 961, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -725, -725, - -725, -725, -725, -725, -725, -725, 448, 448, 448, 448, 448, 448, 448, 448, - -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, 677, 677, 677, 677, - 677, 677, 677, 677, -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275, - /* For intt Len=2, offset IZETA_NTT_OFFSET64 */ - -1103, -1103, -1103, -1103, 430, 430, 430, 430, 555, 555, 555, 555, 843, - 843, 843, 843, -1251, -1251, -1251, -1251, 871, 871, 871, 871, 1550, 1550, - 1550, 1550, 105, 105, 105, 105, 422, 422, 422, 422, 587, 587, 587, 587, 177, - 177, 177, 177, -235, -235, -235, -235, -291, -291, -291, -291, -460, -460, - -460, -460, 1574, 1574, 1574, 1574, 1653, 1653, 1653, 1653, -246, -246, - -246, -246, 778, 778, 778, 778, 1159, 1159, 1159, 1159, -147, -147, -147, - -147, -777, -777, -777, -777, 1483, 1483, 1483, 1483, -602, -602, -602, - -602, 1119, 1119, 1119, 1119, -1590, -1590, -1590, -1590, 644, 644, 644, - 644, -872, -872, -872, -872, 349, 349, 349, 349, 418, 418, 418, 418, 329, - 329, 329, 329, -156, -156, -156, -156, -75, -75, -75, -75, 817, 817, 817, - 817, 1097, 1097, 1097, 1097, 603, 603, 603, 603, 610, 610, 610, 610, 1322, - 1322, 1322, 1322, -1285, -1285, -1285, -1285, -1465, -1465, -1465, -1465, - 384, 384, 384, 384, -1215, -1215, -1215, -1215, -136, -136, -136, -136, - 
1218, 1218, 1218, 1218, -1335, -1335, -1335, -1335, -874, -874, -874, -874, - 220, 220, 220, 220, -1187, -1187, -1187, -1187, -1659, -1659, -1659, -1659, - -1185, -1185, -1185, -1185, -1530, -1530, -1530, -1530, -1278, -1278, -1278, - -1278, 794, 794, 794, 794, -1510, -1510, -1510, -1510, -854, -854, -854, - -854, -870, -870, -870, -870, 478, 478, 478, 478, -108, -108, -108, -108, - -308, -308, -308, -308, 996, 996, 996, 996, 991, 991, 991, 991, 958, 958, - 958, 958, -1460, -1460, -1460, -1460, 1522, 1522, 1522, 1522, 1628, 1628, - 1628, 1628, - /* For intt Len=2, offset IZETA_NTT_OFFSET127 */ - 1628, 1628, 1628, 1628, 1522, 1522, 1522, 1522, -1460, -1460, -1460, -1460, - 958, 958, 958, 958, 991, 991, 991, 991, 996, 996, 996, 996, -308, -308, - -308, -308, -108, -108, -108, -108, 478, 478, 478, 478, -870, -870, -870, - -870, -854, -854, -854, -854, -1510, -1510, -1510, -1510, 794, 794, 794, - 794, -1278, -1278, -1278, -1278, -1530, -1530, -1530, -1530, -1185, -1185, - -1185, -1185, -1659, -1659, -1659, -1659, -1187, -1187, -1187, -1187, 220, - 220, 220, 220, -874, -874, -874, -874, -1335, -1335, -1335, -1335, 1218, - 1218, 1218, 1218, -136, -136, -136, -136, -1215, -1215, -1215, -1215, 384, - 384, 384, 384, -1465, -1465, -1465, -1465, -1285, -1285, -1285, -1285, 1322, - 1322, 1322, 1322, 610, 610, 610, 610, 603, 603, 603, 603, 1097, 1097, 1097, - 1097, 817, 817, 817, 817, -75, -75, -75, -75, -156, -156, -156, -156, 329, - 329, 329, 329, 418, 418, 418, 418, 349, 349, 349, 349, -872, -872, -872, - -872, 644, 644, 644, 644, -1590, -1590, -1590, -1590, 1119, 1119, 1119, - 1119, -602, -602, -602, -602, 1483, 1483, 1483, 1483, -777, -777, -777, - -777, -147, -147, -147, -147, 1159, 1159, 1159, 1159, 778, 778, 778, 778, - -246, -246, -246, -246, 1653, 1653, 1653, 1653, 1574, 1574, 1574, 1574, - -460, -460, -460, -460, -291, -291, -291, -291, -235, -235, -235, -235, 177, - 177, 177, 177, 587, 587, 587, 587, 422, 422, 422, 422, 105, 105, 105, 105, - 1550, 1550, 1550, 1550, 
871, 871, 871, 871, -1251, -1251, -1251, -1251, 843, - 843, 843, 843, 555, 555, 555, 555, 430, 430, 430, 430, -1103, -1103, -1103, - -1103, - /* For intt Len=4 and others, offset IZETA_NTT_OFFSET63 */ - -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275, 677, 677, 677, 677, - 677, 677, 677, 677, -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, - 448, 448, 448, 448, 448, 448, 448, 448, -725, -725, -725, -725, -725, -725, - -725, -725, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, 961, - 961, 961, 961, 961, 961, 961, 961, -398, -398, -398, -398, -398, -398, -398, - -398, -951, -951, -951, -951, -951, -951, -951, -951, -247, -247, -247, - -247, -247, -247, -247, -247, -1421, -1421, -1421, -1421, -1421, -1421, - -1421, -1421, 107, 107, 107, 107, 107, 107, 107, 107, 830, 830, 830, 830, - 830, 830, 830, 830, -271, -271, -271, -271, -271, -271, -271, -271, -90, - -90, -90, -90, -90, -90, -90, -90, -853, -853, -853, -853, -853, -853, -853, - -853, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 126, 126, 126, 126, - 126, 126, 126, 126, -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, - -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -666, -666, -666, - -666, -666, -666, -666, -666, -320, -320, -320, -320, -320, -320, -320, - -320, -8, -8, -8, -8, -8, -8, -8, -8, 516, 516, 516, 516, 516, 516, 516, - 516, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -282, -282, - -282, -282, -282, -282, -282, -282, 1491, 1491, 1491, 1491, 1491, 1491, - 1491, 1491, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1015, - 1015, 1015, 1015, 1015, 1015, 1015, 1015, -552, -552, -552, -552, -552, - -552, -552, -552, 652, 652, 652, 652, 652, 652, 652, 652, 1223, 1223, 1223, - 1223, 1223, 1223, 1223, 1223, -1571, -1571, -1571, -1571, -1571, -1571, - -1571, -1571, -205, -205, -205, -205, -205, -205, -205, -205, 411, 411, 411, - 411, 411, 411, 411, 411, -1542, -1542, -1542, -1542, -1542, -1542, -1542, - -1542, 608, 608, 608, 608, 608, 608, 608, 
608, 732, 732, 732, 732, 732, 732, - 732, 732, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, -681, -681, -681, - -681, -681, -681, -681, -681, -130, -130, -130, -130, -130, -130, -130, - -130, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, 1458, 1458, - 1458, 1458, 1458, 1458, 1458, 1458, -829, -829, -829, -829, -829, -829, - -829, -829, 383, 383, 383, 383, 383, 383, 383, 383, 264, 264, 264, 264, 264, - 264, 264, 264, -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 573, - 573, 573, 573, 573, 573, 573, 573, 1468, 1468, 1468, 1468, 1468, 1468, 1468, - 1468, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1202, -1202, - -1202, -1202, -1202, -1202, -1202, -1202, 962, 962, 962, 962, 962, 962, 962, - 962, 182, 182, 182, 182, 182, 182, 182, 182, 1577, 1577, 1577, 1577, 1577, - 1577, 1577, 1577, 622, 622, 622, 622, 622, 622, 622, 622, -171, -171, -171, - -171, -171, -171, -171, -171, 202, 202, 202, 202, 202, 202, 202, 202, 287, - 287, 287, 287, 287, 287, 287, 287, 1422, 1422, 1422, 1422, 1422, 1422, 1422, - 1422, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, -1517, -1517, -1517, - -1517, -1517, -1517, -1517, -1517, -359, -359, -359, -359, -359, -359, -359, - -359, -758, -758, -758, -758, -758, -758, -758, -758}; + -1571, -1571, -1571, -1571, -1571, -1571, + /* For Len=4 */ + 1223, 1223, 1223, 1223, 652, 652, 652, 652, -552, -552, -552, -552, 1015, + 1015, 1015, 1015, -1293, -1293, -1293, -1293, 1491, 1491, 1491, 1491, -282, + -282, -282, -282, -1544, -1544, -1544, -1544, 516, 516, 516, 516, -8, -8, + -8, -8, -320, -320, -320, -320, -666, -666, -666, -666, -1618, -1618, -1618, + -1618, -1162, -1162, -1162, -1162, 126, 126, 126, 126, 1469, 1469, 1469, + 1469, -853, -853, -853, -853, -90, -90, -90, -90, -271, -271, -271, -271, + 830, 830, 830, 830, 107, 107, 107, 107, -1421, -1421, -1421, -1421, -247, + -247, -247, -247, -951, -951, -951, -951, -398, -398, -398, -398, 961, 961, + 961, 961, -1508, -1508, -1508, -1508, -725, -725, -725, -725, 448, 
448, 448, + 448, -1065, -1065, -1065, -1065, 677, 677, 677, 677, -1275, -1275, -1275, + -1275, + /* + * For ntt Len=2 + * reorder zeta array, (1, 2, 3, 4) -> (3, 1, 4, 2) + * Transpose z[0], z[1], z[2], z[3] + * -> z[3], z[3], z[1], z[1], z[4], z[4], z[2], z[2] + */ + 555, 555, -1103, -1103, 843, 843, 430, 430, 1550, 1550, -1251, -1251, 105, + 105, 871, 871, 177, 177, 422, 422, -235, -235, 587, 587, 1574, 1574, -291, + -291, 1653, 1653, -460, -460, 1159, 1159, -246, -246, -147, -147, 778, 778, + -602, -602, -777, -777, 1119, 1119, 1483, 1483, -872, -872, -1590, -1590, + 349, 349, 644, 644, -156, -156, 418, 418, -75, -75, 329, 329, 603, 603, 817, + 817, 610, 610, 1097, 1097, -1465, -1465, 1322, 1322, 384, 384, -1285, -1285, + 1218, 1218, -1215, -1215, -1335, -1335, -136, -136, -1187, -1187, -874, + -874, -1659, -1659, 220, 220, -1278, -1278, -1185, -1185, 794, 794, -1530, + -1530, -870, -870, -1510, -1510, 478, 478, -854, -854, 996, 996, -108, -108, + 991, 991, -308, -308, 1522, 1522, 958, 958, 1628, 1628, -1460, -1460, + /* + * For intt Len=2, offset IZETA_NTT_OFFSET127 + * reorder zeta array, (1, 2, 3, 4) -> (3, 1, 4, 2) + * Transpose z[0], z[1], z[2], z[3] + * -> z[3], z[3], z[1], z[1], z[4], z[4], z[2], z[2] + */ + -1460, -1460, 1628, 1628, 958, 958, 1522, 1522, -308, -308, 991, 991, -108, + -108, 996, 996, -854, -854, 478, 478, -1510, -1510, -870, -870, -1530, + -1530, 794, 794, -1185, -1185, -1278, -1278, 220, 220, -1659, -1659, -874, + -874, -1187, -1187, -136, -136, -1335, -1335, -1215, -1215, 1218, 1218, + -1285, -1285, 384, 384, 1322, 1322, -1465, -1465, 1097, 1097, 610, 610, 817, + 817, 603, 603, 329, 329, -75, -75, 418, 418, -156, -156, 644, 644, 349, 349, + -1590, -1590, -872, -872, 1483, 1483, 1119, 1119, -777, -777, -602, -602, + 778, 778, -147, -147, -246, -246, 1159, 1159, -460, -460, 1653, 1653, -291, + -291, 1574, 1574, 587, 587, -235, -235, 422, 422, 177, 177, 871, 871, 105, + 105, -1251, -1251, 1550, 1550, 430, 430, 843, 843, -1103, -1103, 555, 
555, + /* For intt Len=4 */ + -1275, -1275, -1275, -1275, 677, 677, 677, 677, -1065, -1065, -1065, -1065, + 448, 448, 448, 448, -725, -725, -725, -725, -1508, -1508, -1508, -1508, 961, + 961, 961, 961, -398, -398, -398, -398, -951, -951, -951, -951, -247, -247, + -247, -247, -1421, -1421, -1421, -1421, 107, 107, 107, 107, 830, 830, 830, + 830, -271, -271, -271, -271, -90, -90, -90, -90, -853, -853, -853, -853, + 1469, 1469, 1469, 1469, 126, 126, 126, 126, -1162, -1162, -1162, -1162, + -1618, -1618, -1618, -1618, -666, -666, -666, -666, -320, -320, -320, -320, + -8, -8, -8, -8, 516, 516, 516, 516, -1544, -1544, -1544, -1544, -282, -282, + -282, -282, 1491, 1491, 1491, 1491, -1293, -1293, -1293, -1293, 1015, 1015, + 1015, 1015, -552, -552, -552, -552, 652, 652, 652, 652, 1223, 1223, 1223, + 1223, + /* For intt Len=8 and others */ + -1571, -1571, -1571, -1571, -1571, -1571, -1571, -1571, -205, -205, -205, + -205, -205, -205, -205, -205, 411, 411, 411, 411, 411, 411, 411, 411, -1542, + -1542, -1542, -1542, -1542, -1542, -1542, -1542, 608, 608, 608, 608, 608, + 608, 608, 608, 732, 732, 732, 732, 732, 732, 732, 732, 1017, 1017, 1017, + 1017, 1017, 1017, 1017, 1017, -681, -681, -681, -681, -681, -681, -681, + -681, -130, -130, -130, -130, -130, -130, -130, -130, -1602, -1602, -1602, + -1602, -1602, -1602, -1602, -1602, 1458, 1458, 1458, 1458, 1458, 1458, 1458, + 1458, -829, -829, -829, -829, -829, -829, -829, -829, 383, 383, 383, 383, + 383, 383, 383, 383, 264, 264, 264, 264, 264, 264, 264, 264, -1325, -1325, + -1325, -1325, -1325, -1325, -1325, -1325, 573, 573, 573, 573, 573, 573, 573, + 573, 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468, -1474, -1474, -1474, + -1474, -1474, -1474, -1474, -1474, -1202, -1202, -1202, -1202, -1202, -1202, + -1202, -1202, 962, 962, 962, 962, 962, 962, 962, 962, 182, 182, 182, 182, + 182, 182, 182, 182, 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577, 622, + 622, 622, 622, 622, 622, 622, 622, -171, -171, -171, -171, -171, -171, -171, + -171, 
202, 202, 202, 202, 202, 202, 202, 202, 287, 287, 287, 287, 287, 287, + 287, 287, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1493, 1493, 1493, + 1493, 1493, 1493, 1493, 1493, -1517, -1517, -1517, + -1517, -1517, -1517, -1517, -1517, -359, -359, -359, -359, -359, -359, -359, -359, -758, + -758, -758, -758, -758, -758, -758, -758}; #endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/consts.h b/dev/ppc64le/src/consts.h index d424601ac..b5e66983f 100644 --- a/dev/ppc64le/src/consts.h +++ b/dev/ppc64le/src/consts.h @@ -14,9 +14,7 @@ #define C1441_OFFSET 64 #define C1353_OFFSET 80 #define ZETA_NTT_OFFSET 96 -#define ZETA_NTT_OFFSET64 1104 -#define IZETA_NTT_OFFSET127 1616 -#define IZETA_NTT_OFFSET63 2128 +#define ZETA_INTT_OFFSET 1104 #ifndef __ASSEMBLER__ #define mlk_ppc_qdata MLK_NAMESPACE(ppc_qdata) diff --git a/dev/ppc64le/src/intt_ppc.S b/dev/ppc64le/src/intt_ppc.S index 95bf370b8..5c7b3dba6 100644 --- a/dev/ppc64le/src/intt_ppc.S +++ b/dev/ppc64le/src/intt_ppc.S @@ -36,6 +36,17 @@ #define V_ZETA 10 #define V1441 10 +.macro Compute_4Coeffs + vsubuhm 25, 8, 21 # r[j+len] - t + vsubuhm 26, 12, 22 # r[j+len] - t + vsubuhm 30, 16, 23 # r[j+len] - t + vsubuhm 31, 20, 24 # r[j+len] - t + vadduhm 8, 8, 21 # r[j+len] + t + vadduhm 12, 12, 22 # r[j+len] + t + vadduhm 16, 16, 23 # r[j+len] + t + vadduhm 20, 20, 24 # r[j+len] + t +.endm + .macro Load_4Coeffs start next step mr 9, \start # j add 10, 7, 9 # J + len*2 @@ -63,14 +74,64 @@ xxpermdi 32+23, 32+23, 32+23, 2 xxpermdi 32+24, 32+24, 32+24, 2 - vsubuhm 25, 8, 21 # r[j+len] - t - vsubuhm 26, 12, 22 # r[j+len] - t - vsubuhm 30, 16, 23 # r[j+len] - t - vsubuhm 31, 20, 24 # r[j+len] - t - vadduhm 8, 8, 21 # r[j+len] + t - vadduhm 12, 12, 22 # r[j+len] + t - vadduhm 16, 16, 23 # r[j+len] + t - vadduhm 20, 20, 24 # r[j+len] + t + Compute_4Coeffs +.endm + +# +# Load Coefficients and setup vectors +# aj0, aj1, ajlen2, ajlen3, aj4, aj5, ajlen6, ajlen7 +# aj8, aj9, 
ajlen10, ajlen11, aj12, aj13, ajlen14, ajlen15 +# +# a[j]= aj0, aj1, aj8, aj9, aj4, aj5, aj12, aj13 +# a[j+len]= ajlen2, ajlen3, ajlen10, ajlen11, ajlen6, ajlen7, ajlen14, ajlen15 +# +.macro Load_L24Coeffs + lxv 32+25, 0(5) # a[j], r[j+len] + lxv 32+26, 16(5) # a[j], r[j+len] + vmrgew 8, 25, 26 + vmrgow 21, 25, 26 + lxv 32+25, 32(5) # a[j], r[j+len] + lxv 32+26, 48(5) # a[j], r[j+len] + vmrgew 12, 25, 26 + vmrgow 22, 25, 26 + lxv 32+25, 64(5) # a[j], r[j+len] + lxv 32+26, 80(5) # a[j], r[j+len] + vmrgew 16, 25, 26 + vmrgow 23, 25, 26 + lxv 32+25, 96(5) # a[j], r[j+len] + lxv 32+26, 112(5) # a[j], r[j+len] + vmrgew 20, 25, 26 + vmrgow 24, 25, 26 +.endm + +# +# Permute +# rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 +# rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 +# +# to +# rjlen4 - rjlen7, rjlen12 - rjlen15 +# rj0 - rj3, rj8 - rj11 +# +.macro Load_L44Coeffs + lxv 10, 0(5) # rj0, rj1, rj2, rj3, + # rjlen4, rjlen5, rjlen6, rjlen7 + lxv 11, 16(5) # rj8, rj9, rj10, rj11 + # rjlen12, rjlen13, rjlen14, rjlen15 + xxpermdi 32+8, 11, 10, 0 # rjlen4 - rjlen7, rjlen12 - rjlen15 + xxpermdi 32+21, 11, 10, 3 # rj0 - rj3, rj8 - rj11 + lxv 10, 32(5) + lxv 11, 48(5) + xxpermdi 32+12, 11, 10, 0 + xxpermdi 32+22, 11, 10, 3 + lxv 10, 64(5) + lxv 11, 80(5) + xxpermdi 32+16, 11, 10, 0 + xxpermdi 32+23, 11, 10, 3 + lxv 10, 96(5) + lxv 11, 112(5) + xxpermdi 32+20, 11, 10, 0 + xxpermdi 32+24, 11, 10, 3 .endm .macro BREDUCE_4X _v0 _v1 _v2 _v3 @@ -131,7 +192,7 @@ .endm #----------------------------------- -# MREDUCE_4X(len, start, _vz0, _vz1, _vz2, _vz3) +# MREDUCE_4X(_vz0, _vz1, _vz2, _vz3, _vo0, _vo1, _vo2, _vo3) # .macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 # Modular multification bond by 2^16 * q in abs value @@ -210,34 +271,88 @@ stxv \_vs7, -16(3) .endm -.macro Write_Len2_4C _vs0 _vs1 _vs2 _vs3 - xxmrglw 32+12, \_vs0, 10 - xxmrghw 32+11, \_vs0, 10 - xxpermdi 10, 32+12, 32+11, 3 - xxmrglw 32+16, \_vs1, 11 - xxmrghw 32+15, \_vs1, 11 - xxpermdi 11, 32+16, 
32+15, 3 - xxmrglw 32+12, \_vs2, 12 - xxmrghw 32+11, \_vs2, 12 - xxpermdi 12, 32+12, 32+11, 3 - xxmrglw 32+16, \_vs3, 13 - xxmrghw 32+15, \_vs3, 13 - xxpermdi 13, 32+16, 32+15, 3 - stxvd2x 10, 3, 9 - stxvd2x 11, 3, 16 - stxvd2x 12, 3, 18 - stxvd2x 13, 3, 20 +.macro PermWriteL44 + xxlor 32+14, 10, 10 + xxlor 32+19, 11, 11 + xxlor 32+24, 12, 12 + xxlor 32+29, 13, 13 + xxpermdi 32+10, 32+13, 32+14, 3 + xxpermdi 32+11, 32+13, 32+14, 0 + xxpermdi 32+12, 32+18, 32+19, 3 + xxpermdi 32+13, 32+18, 32+19, 0 + xxpermdi 32+14, 32+23, 32+24, 3 + xxpermdi 32+15, 32+23, 32+24, 0 + xxpermdi 32+16, 32+28, 32+29, 3 + xxpermdi 32+17, 32+28, 32+29, 0 + stxv 32+10, 0(5) + stxv 32+11, 16(5) + stxv 32+12, 32(5) + stxv 32+13, 48(5) + stxv 32+14, 64(5) + stxv 32+15, 80(5) + stxv 32+16, 96(5) + stxv 32+17, 112(5) +.endm + +.macro PermWriteL24 + xxlor 32+14, 10, 10 + xxlor 32+19, 11, 11 + xxlor 32+24, 12, 12 + xxlor 32+29, 13, 13 + vmrgew 10, 13, 14 + vmrgow 11, 13, 14 + vmrgew 12, 18, 19 + vmrgow 13, 18, 19 + vmrgew 14, 23, 24 + vmrgow 15, 23, 24 + vmrgew 16, 28, 29 + vmrgow 17, 28, 29 + stxv 32+10, 0(5) + stxv 32+11, 16(5) + stxv 32+12, 32(5) + stxv 32+13, 48(5) + stxv 32+14, 64(5) + stxv 32+15, 80(5) + stxv 32+16, 96(5) + stxv 32+17, 112(5) +.endm + +.macro INTT_REDUCE_L24 + Load_L24Coeffs + Compute_4Coeffs + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + PermWriteL24 +.endm + +.macro INTT_REDUCE_L44 + Load_L44Coeffs + Compute_4Coeffs + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + PermWriteL44 .endm -.macro Write_Len4_4C _vs0 _vs1 _vs2 _vs3 - xxpermdi 10, 10, \_vs0, 3 - xxpermdi 11, 11, \_vs1, 3 - xxpermdi 12, 12, \_vs2, 3 - xxpermdi 13, 13, \_vs3, 3 - stxvd2x 10, 
3, 9 - stxvd2x 11, 3, 16 - stxvd2x 12, 3, 18 - stxvd2x 13, 3, 20 +.macro INTT_REDUCE_4X start next step + Load_4Coeffs \start, \next, \step + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 .endm # intt @@ -301,164 +416,85 @@ MLK_ASM_FN_SYMBOL(intt_ppc) vslw 9, 9, 10 xxlor 7, 32+9, 32+9 # V_25 syore at vs7 + # + # Montgomery reduce loops with constant 1441 + # + addi 14, 4, C1441_OFFSET + lvx V1441, 0, 14 + li 15, 4 # loops + mtctr 15 + + Set_mont_consts +intt_ppc__Loopf: + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + bdnz intt_ppc__Loopf + + addi 3, 3, -512 + .align 4 # # 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 # Update zetas vectors, each vector has 2 zetas - addi 14, 4, IZETA_NTT_OFFSET127 + addi 14, 4, ZETA_INTT_OFFSET li 7, 4 li 15, 4 mtctr 15 - li 5, 0 + mr 5, 3 intt_ppc__Loop2: - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_Len2_4C 32+13, 32+18, 32+23, 32+28 - - addi 5, 5, 64 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_Len2_4C 32+13, 32+18, 32+23, 32+28 - addi 5, 5, 64 + INTT_REDUCE_L24 + addi 5, 5, 128 bdnz intt_ppc__Loop2 .align 4 # # 2. 
len = 4, start = 0, 8, 16, 24,...232, 240, 248 - addi 14, 4, IZETA_NTT_OFFSET63 - li 5, 0 + mr 5, 3 li 7, 8 li 15, 4 # loops mtctr 15 intt_ppc__Loop4: - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_Len4_4C 32+13, 32+18, 32+23, 32+28 - addi 5, 5, 64 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_Len4_4C 32+13, 32+18, 32+23, 32+28 - addi 5, 5, 64 + INTT_REDUCE_L44 + addi 5, 5, 128 bdnz intt_ppc__Loop4 .align 4 # 3. len = 8, start = 0, 16, 32, 48,...208, 224, 240 - #addi 14, 14, 512 li 7, 16 li 5, 0 + li 15, 4 # loops + mtctr 15 - Load_4Coeffs 5, 32, 32 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 128 - - Load_4Coeffs 5, 32, 32 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 256 - - Load_4Coeffs 5, 32, 32 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 384 - - Load_4Coeffs 5, 32, 32 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 +intt_ppc__Loop8: + INTT_REDUCE_4X 5, 32, 32 + addi 5, 5, 128 + bdnz intt_ppc__Loop8 .align 4 # # 4. 
len = 16, start = 0, 32, 64,,...160, 192, 224 - #addi 14, 14, 768 li 5, 0 li 7, 32 - Load_4Coeffs 5, 64, 64 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + INTT_REDUCE_4X 5, 64, 64 + li 5, 16 - Load_4Coeffs 5, 64, 64 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts addi 14, 14, -64 - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + INTT_REDUCE_4X 5, 64, 64 li 5, 256 - Load_4Coeffs 5, 64, 64 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + INTT_REDUCE_4X 5, 64, 64 li 5, 272 - Load_4Coeffs 5, 64, 64 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts addi 14, 14, -64 - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + INTT_REDUCE_4X 5, 64, 64 .align 4 # # 5. len = 32, start = 0, 64, 128, 192 - #addi 14, 14, 896 li 5, 0 li 7, 64 @@ -504,7 +540,6 @@ intt_ppc__Loop4: .align 4 # # 6. len = 64, start = 0, 128 - #addi 14, 14, 960 li 5, 0 li 7, 128 Load_4Coeffs 5, 16, 16 @@ -547,7 +582,6 @@ intt_ppc__Loop4: .align 4 # 7. 
len = 128, start = 0 # - #addi 14, 14, 992 li 5, 0 # start li 7, 256 # len * 2 @@ -587,37 +621,6 @@ intt_ppc__Loop4: MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 Write_M4C 32+13, 32+18, 32+23, 32+28 -.align 4 - # - # Montgomery reduce loops with constant 1441 - # - addi 14, 4, C1441_OFFSET - lvx V1441, 0, 14 - - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 - MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 - - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 - MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 - - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 - MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 - - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 - MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 - lxv 32+20, 128(1) lxv 32+21, 144(1) lxv 32+22, 160(1) diff --git a/dev/ppc64le/src/ntt_ppc.S b/dev/ppc64le/src/ntt_ppc.S index 0c98581c5..435e5bb52 100644 --- a/dev/ppc64le/src/ntt_ppc.S +++ b/dev/ppc64le/src/ntt_ppc.S @@ -28,15 +28,7 @@ .machine "any" .text -# -# montgomery_reduce -# t = a * QINV -# t = (a - (int32_t)t*_MLKEM_Q) >> 16 -# -#----------------------------------- -# MREDUCE_4X(start, _vz0, _vz1, _vz2, _vz3) -# -.macro MREDUCE_4X start next step _vz0 _vz1 _vz2 _vz3 +.macro Load_4Coeffs start next step mr 9, \start add 10, 7, 9 # J + len*2 addi 16, 9, \next @@ -53,7 +45,74 @@ xxpermdi 32+18, 32+18, 32+18, 2 xxpermdi 32+23, 32+23, 32+23, 2 xxpermdi 32+28, 32+28, 32+28, 2 +.endm +# +# Load Coefficients and setup vectors +# aj0, aj1, ajlen2, ajlen3, aj4, aj5, ajlen6, ajlen7 +# aj8, aj9, ajlen10, ajlen11, aj12, aj13, ajlen14, 
ajlen15 +# +# a[j]= aj0, aj1, aj8, aj9, aj4, aj5, aj12, aj13 +# a[j+len]= ajlen2, ajlen3, ajlen10, ajlen11, ajlen6, ajlen7, ajlen14, ajlen15 +# +.macro Load_L24Coeffs + lxv 32+25, 0(5) # a[j], r[j+len] + lxv 32+26, 16(5) # a[j], r[j+len] + vmrgew 13, 25, 26 + vmrgow 12, 25, 26 + lxv 32+25, 32(5) # a[j], r[j+len] + lxv 32+26, 48(5) # a[j], r[j+len] + vmrgew 18, 25, 26 + vmrgow 17, 25, 26 + lxv 32+25, 64(5) # a[j], r[j+len] + lxv 32+26, 80(5) # a[j], r[j+len] + vmrgew 23, 25, 26 + vmrgow 22, 25, 26 + lxv 32+25, 96(5) # a[j], r[j+len] + lxv 32+26, 112(5) # a[j], r[j+len] + vmrgew 28, 25, 26 + vmrgow 27, 25, 26 +.endm + +# +# Permute +# rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 +# rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 +# +# to +# rjlen4 - rjlen7, rjlen12 - rjlen15 +# rj0 - rj3, rj8 - rj11 +# +.macro Load_L44Coeffs + lxv 1, 0(5) # rj0, rj1, rj2, rj3, + # rjlen4, rjlen5, rjlen6, rjlen7 + lxv 2, 16(5) # rj8, rj9, rj10, rj11 + # rjlen12, rjlen13, rjlen14, rjlen15 + xxpermdi 32+13, 2, 1, 0 # rjlen4 - rjlen7, rjlen12 - rjlen15 + xxpermdi 32+12, 2, 1, 3 # rj0 - rj3, rj8 - rj11 + lxv 3, 32(5) + lxv 4, 48(5) + xxpermdi 32+18, 4, 3, 0 + xxpermdi 32+17, 4, 3, 3 + lxv 1, 64(5) + lxv 2, 80(5) + xxpermdi 32+23, 2, 1, 0 + xxpermdi 32+22, 2, 1, 3 + lxv 3, 96(5) + lxv 4, 112(5) + xxpermdi 32+28, 4, 3, 0 + xxpermdi 32+27, 4, 3, 3 +.endm + +# +# montgomery_reduce +# t = a * QINV +# t = (a - (int32_t)t*_MLKEM_Q) >> 16 +# +#----------------------------------- +# MREDUCE_4X(_vz0, _vz1, _vz2, _vz3) +# +.macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 # fqmul = zeta * coefficient # Modular multification bond by 2^16 * q in abs value vmladduhm 15, 13, \_vz0, 3 @@ -82,6 +141,9 @@ vsrah 23, 25, 4 # >> 1 vsrah 28, 30, 4 # >> 1 +.endm + +.macro Load_4Aj lxvd2x 32+12, 3, 9 # r[j] lxvd2x 32+17, 3, 16 # r[j] lxvd2x 32+22, 3, 18 # r[j] @@ -90,7 +152,9 @@ xxpermdi 32+17, 32+17, 32+17, 2 xxpermdi 32+22, 32+22, 32+22, 2 xxpermdi 32+27, 32+27, 32+27, 2 +.endm +.macro Compute_4Coeffs # 
Since the result of the Montgomery multiplication is bounded # by q in absolute value. # Finally to complete the final update of the results with add/sub @@ -104,6 +168,13 @@ vadduhm 30, 28, 27 # r + t .endm +.macro NTT_MREDUCE_4X start next step _vz0 _vz1 _vz2 _vz3 + Load_4Coeffs \start, \next, \step + MREDUCE_4x \_vz0, \_vz1, \_vz2, \_vz3 + Load_4Aj + Compute_4Coeffs +.endm + .macro Write_One stxvx 32+15, 3, 9 stxvx 32+16, 3, 10 @@ -115,35 +186,44 @@ stxvx 32+31, 3, 21 .endm -.macro Write_Two - xxpermdi 32+17, 32+16, 32+15, 3 - xxpermdi 32+22, 32+21, 32+20, 3 - xxpermdi 32+27, 32+26, 32+25, 3 - xxpermdi 32+29, 32+31, 32+30, 3 - - stxvx 32+17, 3, 9 - stxvx 32+22, 3, 16 - stxvx 32+27, 3, 18 - stxvx 32+29, 3, 20 +.macro PermWriteL44 + Compute_4Coeffs + xxpermdi 0, 32+16, 32+15, 3 + xxpermdi 1, 32+16, 32+15, 0 + xxpermdi 2, 32+21, 32+20, 3 + xxpermdi 3, 32+21, 32+20, 0 + xxpermdi 4, 32+26, 32+25, 3 + xxpermdi 5, 32+26, 32+25, 0 + xxpermdi 6, 32+31, 32+30, 3 + xxpermdi 7, 32+31, 32+30, 0 + stxv 0, 0(5) + stxv 1, 16(5) + stxv 2, 32(5) + stxv 3, 48(5) + stxv 4, 64(5) + stxv 5, 80(5) + stxv 6, 96(5) + stxv 7, 112(5) .endm -.macro Write_Three - xxmrglw 32+14, 32+16, 32+15 - xxmrghw 32+13, 32+16, 32+15 - xxpermdi 32+17, 32+13, 32+14, 3 - xxmrglw 32+19, 32+21, 32+20 - xxmrghw 32+18, 32+21, 32+20 - xxpermdi 32+22, 32+18, 32+19, 3 - xxmrglw 32+14, 32+26, 32+25 - xxmrghw 32+13, 32+26, 32+25 - xxpermdi 32+27, 32+13, 32+14, 3 - xxmrglw 32+24, 32+31, 32+30 - xxmrghw 32+23, 32+31, 32+30 - xxpermdi 32+29, 32+23, 32+24, 3 - stxvx 32+17, 3, 9 - stxvx 32+22, 3, 16 - stxvx 32+27, 3, 18 - stxvx 32+29, 3, 20 +.macro PermWriteL24 + Compute_4Coeffs + vmrgew 10, 16, 15 + vmrgow 11, 16, 15 + vmrgew 12, 21, 20 + vmrgow 13, 21, 20 + vmrgew 14, 26, 25 + vmrgow 15, 26, 25 + vmrgew 16, 31, 30 + vmrgow 17, 31, 30 + stxv 32+10, 0(5) + stxv 32+11, 16(5) + stxv 32+12, 32(5) + stxv 32+13, 48(5) + stxv 32+14, 64(5) + stxv 32+15, 80(5) + stxv 32+16, 96(5) + stxv 32+17, 112(5) .endm .macro 
Load_next_4zetas @@ -207,16 +287,16 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One li 5, 64 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One li 5, 128 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One li 5, 192 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One .align 4 @@ -227,19 +307,19 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) li 7, 128 lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One li 5, 64 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One li 5, 256 lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One li 5, 320 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One .align 4 @@ -250,28 +330,25 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) li 7, 64 lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One - #li 5, 64 li 5, 128 lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One - #li 5, 128 li 5, 256 lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One - #li 5, 192 li 5, 384 lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + 
NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One .align 4 @@ -281,18 +358,18 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) li 5, 0 li 7, 32 Load_next_4zetas - MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 Write_One li 5, 16 - MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 Write_One Load_next_4zetas li 5, 256 - MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 Write_One li 5, 272 - MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 Write_One .align 4 @@ -302,22 +379,22 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) li 5, 0 li 7, 16 Load_next_4zetas - MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 Write_One li 5, 128 Load_next_4zetas - MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 Write_One li 5, 256 Load_next_4zetas - MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 Write_One li 5, 384 Load_next_4zetas - MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 Write_One # @@ -325,19 +402,15 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) # k += 32 li 15, 4 # loops mtctr 15 - li 5, 0 + mr 5, 3 li 7, 8 .align 4 ntt_ppc__Len4: Load_next_4zetas - MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 - Write_Two - addi 5, 5, 64 - - Load_next_4zetas - MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 - Write_Two - addi 5, 5, 64 + Load_L44Coeffs + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 + PermWriteL44 + addi 5, 5, 128 bdnz ntt_ppc__Len4 @@ -346,23 +419,17 @@ ntt_ppc__Len4: # k += 64 # Update zetas vectors, each vector has 2 zetas - addi 14, 4, ZETA_NTT_OFFSET64 - li 15, 4 mtctr 15 - li 5, 0 + mr 5, 3 li 7, 4 .align 4 ntt_ppc__Len2: Load_next_4zetas - MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 - Write_Three - addi 5, 5, 64 - - Load_next_4zetas - MREDUCE_4X 
5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 - Write_Three - addi 5, 5, 64 + Load_L24Coeffs + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 + PermWriteL24 + addi 5, 5, 128 bdnz ntt_ppc__Len2 diff --git a/mlkem/src/native/ppc64le/src/consts.c b/mlkem/src/native/ppc64le/src/consts.c index 4c2fbdf61..fa0f7097f 100644 --- a/mlkem/src/native/ppc64le/src/consts.c +++ b/mlkem/src/native/ppc64le/src/consts.c @@ -3,6 +3,11 @@ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ +#include +#include +#include +#include + #include "../../../common.h" #if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ @@ -10,7 +15,7 @@ #include "consts.h" -MLK_ALIGN const int16_t mlk_ppc_qdata[1568] = { +MLK_ALIGN const int16_t mlk_ppc_qdata[1072] = { /* -Q */ -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329, /* QINV */ @@ -44,112 +49,84 @@ MLK_ALIGN const int16_t mlk_ppc_qdata[1568] = { 732, 732, 732, 732, 732, 732, 608, 608, 608, 608, 608, 608, 608, 608, -1542, -1542, -1542, -1542, -1542, -1542, -1542, -1542, 411, 411, 411, 411, 411, 411, 411, 411, -205, -205, -205, -205, -205, -205, -205, -205, -1571, -1571, - -1571, -1571, -1571, -1571, -1571, -1571, 1223, 1223, 1223, 1223, 1223, - 1223, 1223, 1223, 652, 652, 652, 652, 652, 652, 652, 652, -552, -552, -552, - -552, -552, -552, -552, -552, 1015, 1015, 1015, 1015, 1015, 1015, 1015, - 1015, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1491, 1491, - 1491, 1491, 1491, 1491, 1491, 1491, -282, -282, -282, -282, -282, -282, - -282, -282, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, 516, - 516, 516, 516, 516, 516, 516, 516, -8, -8, -8, -8, -8, -8, -8, -8, -320, - -320, -320, -320, -320, -320, -320, -320, -666, -666, -666, -666, -666, - -666, -666, -666, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, - -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, 126, 126, 126, 126, - 126, 126, 126, 126, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, -853, - -853, -853, -853, -853, -853, -853, -853, -90, -90, -90, -90, -90, -90, -90, - 
-90, -271, -271, -271, -271, -271, -271, -271, -271, 830, 830, 830, 830, - 830, 830, 830, 830, 107, 107, 107, 107, 107, 107, 107, 107, -1421, -1421, - -1421, -1421, -1421, -1421, -1421, -1421, -247, -247, -247, -247, -247, - -247, -247, -247, -951, -951, -951, -951, -951, -951, -951, -951, -398, - -398, -398, -398, -398, -398, -398, -398, 961, 961, 961, 961, 961, 961, 961, - 961, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -725, -725, - -725, -725, -725, -725, -725, -725, 448, 448, 448, 448, 448, 448, 448, 448, - -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, 677, 677, 677, 677, - 677, 677, 677, 677, -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275, - /* For intt Len=2, offset IZETA_NTT_OFFSET64 */ - -1103, -1103, -1103, -1103, 430, 430, 430, 430, 555, 555, 555, 555, 843, - 843, 843, 843, -1251, -1251, -1251, -1251, 871, 871, 871, 871, 1550, 1550, - 1550, 1550, 105, 105, 105, 105, 422, 422, 422, 422, 587, 587, 587, 587, 177, - 177, 177, 177, -235, -235, -235, -235, -291, -291, -291, -291, -460, -460, - -460, -460, 1574, 1574, 1574, 1574, 1653, 1653, 1653, 1653, -246, -246, - -246, -246, 778, 778, 778, 778, 1159, 1159, 1159, 1159, -147, -147, -147, - -147, -777, -777, -777, -777, 1483, 1483, 1483, 1483, -602, -602, -602, - -602, 1119, 1119, 1119, 1119, -1590, -1590, -1590, -1590, 644, 644, 644, - 644, -872, -872, -872, -872, 349, 349, 349, 349, 418, 418, 418, 418, 329, - 329, 329, 329, -156, -156, -156, -156, -75, -75, -75, -75, 817, 817, 817, - 817, 1097, 1097, 1097, 1097, 603, 603, 603, 603, 610, 610, 610, 610, 1322, - 1322, 1322, 1322, -1285, -1285, -1285, -1285, -1465, -1465, -1465, -1465, - 384, 384, 384, 384, -1215, -1215, -1215, -1215, -136, -136, -136, -136, - 1218, 1218, 1218, 1218, -1335, -1335, -1335, -1335, -874, -874, -874, -874, - 220, 220, 220, 220, -1187, -1187, -1187, -1187, -1659, -1659, -1659, -1659, - -1185, -1185, -1185, -1185, -1530, -1530, -1530, -1530, -1278, -1278, -1278, - -1278, 794, 794, 794, 794, -1510, -1510, 
-1510, -1510, -854, -854, -854, - -854, -870, -870, -870, -870, 478, 478, 478, 478, -108, -108, -108, -108, - -308, -308, -308, -308, 996, 996, 996, 996, 991, 991, 991, 991, 958, 958, - 958, 958, -1460, -1460, -1460, -1460, 1522, 1522, 1522, 1522, 1628, 1628, - 1628, 1628, - /* For intt Len=2, offset IZETA_NTT_OFFSET127 */ - 1628, 1628, 1628, 1628, 1522, 1522, 1522, 1522, -1460, -1460, -1460, -1460, - 958, 958, 958, 958, 991, 991, 991, 991, 996, 996, 996, 996, -308, -308, - -308, -308, -108, -108, -108, -108, 478, 478, 478, 478, -870, -870, -870, - -870, -854, -854, -854, -854, -1510, -1510, -1510, -1510, 794, 794, 794, - 794, -1278, -1278, -1278, -1278, -1530, -1530, -1530, -1530, -1185, -1185, - -1185, -1185, -1659, -1659, -1659, -1659, -1187, -1187, -1187, -1187, 220, - 220, 220, 220, -874, -874, -874, -874, -1335, -1335, -1335, -1335, 1218, - 1218, 1218, 1218, -136, -136, -136, -136, -1215, -1215, -1215, -1215, 384, - 384, 384, 384, -1465, -1465, -1465, -1465, -1285, -1285, -1285, -1285, 1322, - 1322, 1322, 1322, 610, 610, 610, 610, 603, 603, 603, 603, 1097, 1097, 1097, - 1097, 817, 817, 817, 817, -75, -75, -75, -75, -156, -156, -156, -156, 329, - 329, 329, 329, 418, 418, 418, 418, 349, 349, 349, 349, -872, -872, -872, - -872, 644, 644, 644, 644, -1590, -1590, -1590, -1590, 1119, 1119, 1119, - 1119, -602, -602, -602, -602, 1483, 1483, 1483, 1483, -777, -777, -777, - -777, -147, -147, -147, -147, 1159, 1159, 1159, 1159, 778, 778, 778, 778, - -246, -246, -246, -246, 1653, 1653, 1653, 1653, 1574, 1574, 1574, 1574, - -460, -460, -460, -460, -291, -291, -291, -291, -235, -235, -235, -235, 177, - 177, 177, 177, 587, 587, 587, 587, 422, 422, 422, 422, 105, 105, 105, 105, - 1550, 1550, 1550, 1550, 871, 871, 871, 871, -1251, -1251, -1251, -1251, 843, - 843, 843, 843, 555, 555, 555, 555, 430, 430, 430, 430, -1103, -1103, -1103, - -1103, - /* For intt Len=4 and others, offset IZETA_NTT_OFFSET63 */ - -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275, 677, 677, 677, 
677, - 677, 677, 677, 677, -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, - 448, 448, 448, 448, 448, 448, 448, 448, -725, -725, -725, -725, -725, -725, - -725, -725, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, 961, - 961, 961, 961, 961, 961, 961, 961, -398, -398, -398, -398, -398, -398, -398, - -398, -951, -951, -951, -951, -951, -951, -951, -951, -247, -247, -247, - -247, -247, -247, -247, -247, -1421, -1421, -1421, -1421, -1421, -1421, - -1421, -1421, 107, 107, 107, 107, 107, 107, 107, 107, 830, 830, 830, 830, - 830, 830, 830, 830, -271, -271, -271, -271, -271, -271, -271, -271, -90, - -90, -90, -90, -90, -90, -90, -90, -853, -853, -853, -853, -853, -853, -853, - -853, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 126, 126, 126, 126, - 126, 126, 126, 126, -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, - -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -666, -666, -666, - -666, -666, -666, -666, -666, -320, -320, -320, -320, -320, -320, -320, - -320, -8, -8, -8, -8, -8, -8, -8, -8, 516, 516, 516, 516, 516, 516, 516, - 516, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -282, -282, - -282, -282, -282, -282, -282, -282, 1491, 1491, 1491, 1491, 1491, 1491, - 1491, 1491, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1015, - 1015, 1015, 1015, 1015, 1015, 1015, 1015, -552, -552, -552, -552, -552, - -552, -552, -552, 652, 652, 652, 652, 652, 652, 652, 652, 1223, 1223, 1223, - 1223, 1223, 1223, 1223, 1223, -1571, -1571, -1571, -1571, -1571, -1571, - -1571, -1571, -205, -205, -205, -205, -205, -205, -205, -205, 411, 411, 411, - 411, 411, 411, 411, 411, -1542, -1542, -1542, -1542, -1542, -1542, -1542, - -1542, 608, 608, 608, 608, 608, 608, 608, 608, 732, 732, 732, 732, 732, 732, - 732, 732, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, -681, -681, -681, - -681, -681, -681, -681, -681, -130, -130, -130, -130, -130, -130, -130, - -130, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, 1458, 1458, - 1458, 1458, 
1458, 1458, 1458, 1458, -829, -829, -829, -829, -829, -829, - -829, -829, 383, 383, 383, 383, 383, 383, 383, 383, 264, 264, 264, 264, 264, - 264, 264, 264, -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 573, - 573, 573, 573, 573, 573, 573, 573, 1468, 1468, 1468, 1468, 1468, 1468, 1468, - 1468, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1202, -1202, - -1202, -1202, -1202, -1202, -1202, -1202, 962, 962, 962, 962, 962, 962, 962, - 962, 182, 182, 182, 182, 182, 182, 182, 182, 1577, 1577, 1577, 1577, 1577, - 1577, 1577, 1577, 622, 622, 622, 622, 622, 622, 622, 622, -171, -171, -171, - -171, -171, -171, -171, -171, 202, 202, 202, 202, 202, 202, 202, 202, 287, - 287, 287, 287, 287, 287, 287, 287, 1422, 1422, 1422, 1422, 1422, 1422, 1422, - 1422, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, -1517, -1517, -1517, - -1517, -1517, -1517, -1517, -1517, -359, -359, -359, -359, -359, -359, -359, - -359, -758, -758, -758, -758, -758, -758, -758, -758}; + -1571, -1571, -1571, -1571, -1571, -1571, + /* For Len=4 */ + 1223, 1223, 1223, 1223, 652, 652, 652, 652, -552, -552, -552, -552, 1015, + 1015, 1015, 1015, -1293, -1293, -1293, -1293, 1491, 1491, 1491, 1491, -282, + -282, -282, -282, -1544, -1544, -1544, -1544, 516, 516, 516, 516, -8, -8, + -8, -8, -320, -320, -320, -320, -666, -666, -666, -666, -1618, -1618, -1618, + -1618, -1162, -1162, -1162, -1162, 126, 126, 126, 126, 1469, 1469, 1469, + 1469, -853, -853, -853, -853, -90, -90, -90, -90, -271, -271, -271, -271, + 830, 830, 830, 830, 107, 107, 107, 107, -1421, -1421, -1421, -1421, -247, + -247, -247, -247, -951, -951, -951, -951, -398, -398, -398, -398, 961, 961, + 961, 961, -1508, -1508, -1508, -1508, -725, -725, -725, -725, 448, 448, 448, + 448, -1065, -1065, -1065, -1065, 677, 677, 677, 677, -1275, -1275, -1275, + -1275, + /* + * For ntt Len=2 + * reorder zeta array, (1, 2, 3, 4) -> (3, 1, 4, 2) + * Transpose z[0], z[1], z[2], z[3] + * -> z[3], z[3], z[1], z[1], z[4], z[4], z[2], z[2] + */ + 555, 
555, -1103, -1103, 843, 843, 430, 430, 1550, 1550, -1251, -1251, 105, + 105, 871, 871, 177, 177, 422, 422, -235, -235, 587, 587, 1574, 1574, -291, + -291, 1653, 1653, -460, -460, 1159, 1159, -246, -246, -147, -147, 778, 778, + -602, -602, -777, -777, 1119, 1119, 1483, 1483, -872, -872, -1590, -1590, + 349, 349, 644, 644, -156, -156, 418, 418, -75, -75, 329, 329, 603, 603, 817, + 817, 610, 610, 1097, 1097, -1465, -1465, 1322, 1322, 384, 384, -1285, -1285, + 1218, 1218, -1215, -1215, -1335, -1335, -136, -136, -1187, -1187, -874, + -874, -1659, -1659, 220, 220, -1278, -1278, -1185, -1185, 794, 794, -1530, + -1530, -870, -870, -1510, -1510, 478, 478, -854, -854, 996, 996, -108, -108, + 991, 991, -308, -308, 1522, 1522, 958, 958, 1628, 1628, -1460, -1460, + /* + * For intt Len=2, offset IZETA_NTT_OFFSET127 + * reorder zeta array, (1, 2, 3, 4) -> (3, 1, 4, 2) + * Transpose z[0], z[1], z[2], z[3] + * -> z[3], z[3], z[1], z[1], z[4], z[4], z[2], z[2] + */ + -1460, -1460, 1628, 1628, 958, 958, 1522, 1522, -308, -308, 991, 991, -108, + -108, 996, 996, -854, -854, 478, 478, -1510, -1510, -870, -870, -1530, + -1530, 794, 794, -1185, -1185, -1278, -1278, 220, 220, -1659, -1659, -874, + -874, -1187, -1187, -136, -136, -1335, -1335, -1215, -1215, 1218, 1218, + -1285, -1285, 384, 384, 1322, 1322, -1465, -1465, 1097, 1097, 610, 610, 817, + 817, 603, 603, 329, 329, -75, -75, 418, 418, -156, -156, 644, 644, 349, 349, + -1590, -1590, -872, -872, 1483, 1483, 1119, 1119, -777, -777, -602, -602, + 778, 778, -147, -147, -246, -246, 1159, 1159, -460, -460, 1653, 1653, -291, + -291, 1574, 1574, 587, 587, -235, -235, 422, 422, 177, 177, 871, 871, 105, + 105, -1251, -1251, 1550, 1550, 430, 430, 843, 843, -1103, -1103, 555, 555, + /* For intt Len=4 */ + -1275, -1275, -1275, -1275, 677, 677, 677, 677, -1065, -1065, -1065, -1065, + 448, 448, 448, 448, -725, -725, -725, -725, -1508, -1508, -1508, -1508, 961, + 961, 961, 961, -398, -398, -398, -398, -951, -951, -951, -951, -247, -247, + -247, 
-247, -1421, -1421, -1421, -1421, 107, 107, 107, 107, 830, 830, 830, + 830, -271, -271, -271, -271, -90, -90, -90, -90, -853, -853, -853, -853, + 1469, 1469, 1469, 1469, 126, 126, 126, 126, -1162, -1162, -1162, -1162, + -1618, -1618, -1618, -1618, -666, -666, -666, -666, -320, -320, -320, -320, + -8, -8, -8, -8, 516, 516, 516, 516, -1544, -1544, -1544, -1544, -282, -282, + -282, -282, 1491, 1491, 1491, 1491, -1293, -1293, -1293, -1293, 1015, 1015, + 1015, 1015, -552, -552, -552, -552, 652, 652, 652, 652, 1223, 1223, 1223, + 1223, + /* For intt Len=8 and others */ + -1571, -1571, -1571, -1571, -1571, -1571, -1571, -1571, -205, -205, -205, + -205, -205, -205, -205, -205, 411, 411, 411, 411, 411, 411, 411, 411, -1542, + -1542, -1542, -1542, -1542, -1542, -1542, -1542, 608, 608, 608, 608, 608, + 608, 608, 608, 732, 732, 732, 732, 732, 732, 732, 732, 1017, 1017, 1017, + 1017, 1017, 1017, 1017, 1017, -681, -681, -681, -681, -681, -681, -681, + -681, -130, -130, -130, -130, -130, -130, -130, -130, -1602, -1602, -1602, + -1602, -1602, -1602, -1602, -1602, 1458, 1458, 1458, 1458, 1458, 1458, 1458, + 1458, -829, -829, -829, -829, -829, -829, -829, -829, 383, 383, 383, 383, + 383, 383, 383, 383, 264, 264, 264, 264, 264, 264, 264, 264, -1325, -1325, + -1325, -1325, -1325, -1325, -1325, -1325, 573, 573, 573, 573, 573, 573, 573, + 573, 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468, -1474, -1474, -1474, + -1474, -1474, -1474, -1474, -1474, -1202, -1202, -1202, -1202, -1202, -1202, + -1202, -1202, 962, 962, 962, 962, 962, 962, 962, 962, 182, 182, 182, 182, + 182, 182, 182, 182, 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577, 622, + 622, 622, 622, 622, 622, 622, 622, -171, -171, -171, -171, -171, -171, -171, + -171, 202, 202, 202, 202, 202, 202, 202, 202, 287, 287, 287, 287, 287, 287, + 287, 287, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1493, 1493, 1493, + 1493, 1493, 1493, 1493, 1493, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, -1517, -359, -359, -359, -359, -359, 
-359, -359, -359, -758, -758, + -758, -758, -758, -758, -758, -758}; #endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mlkem/src/native/ppc64le/src/consts.h b/mlkem/src/native/ppc64le/src/consts.h index 49f519d0c..df5d163f7 100644 --- a/mlkem/src/native/ppc64le/src/consts.h +++ b/mlkem/src/native/ppc64le/src/consts.h @@ -14,9 +14,7 @@ #define C1441_OFFSET 64 #define C1353_OFFSET 80 #define ZETA_NTT_OFFSET 96 -#define ZETA_NTT_OFFSET64 1104 -#define IZETA_NTT_OFFSET127 1616 -#define IZETA_NTT_OFFSET63 2128 +#define ZETA_INTT_OFFSET 1104 #ifndef __ASSEMBLER__ #define mlk_ppc_qdata MLK_NAMESPACE(ppc_qdata) diff --git a/mlkem/src/native/ppc64le/src/intt_ppc.S b/mlkem/src/native/ppc64le/src/intt_ppc.S index 817c8c299..65df15b99 100644 --- a/mlkem/src/native/ppc64le/src/intt_ppc.S +++ b/mlkem/src/native/ppc64le/src/intt_ppc.S @@ -35,6 +35,17 @@ #define V_ZETA 10 #define V1441 10 +.macro Compute_4Coeffs + vsubuhm 25, 8, 21 # r[j+len] - t + vsubuhm 26, 12, 22 # r[j+len] - t + vsubuhm 30, 16, 23 # r[j+len] - t + vsubuhm 31, 20, 24 # r[j+len] - t + vadduhm 8, 8, 21 # r[j+len] + t + vadduhm 12, 12, 22 # r[j+len] + t + vadduhm 16, 16, 23 # r[j+len] + t + vadduhm 20, 20, 24 # r[j+len] + t +.endm + .macro Load_4Coeffs start next step mr 9, \start # j add 10, 7, 9 # J + len*2 @@ -62,14 +73,64 @@ xxpermdi 32+23, 32+23, 32+23, 2 xxpermdi 32+24, 32+24, 32+24, 2 - vsubuhm 25, 8, 21 # r[j+len] - t - vsubuhm 26, 12, 22 # r[j+len] - t - vsubuhm 30, 16, 23 # r[j+len] - t - vsubuhm 31, 20, 24 # r[j+len] - t - vadduhm 8, 8, 21 # r[j+len] + t - vadduhm 12, 12, 22 # r[j+len] + t - vadduhm 16, 16, 23 # r[j+len] + t - vadduhm 20, 20, 24 # r[j+len] + t + Compute_4Coeffs +.endm + +# +# Load Coeffients and setup vectors +# aj0, aj1, ajlen2, ajlen3, aj4, aj5, ajlen6, ajlen7 +# aj8, aj9, ajlen10, ajlen11, aj12, aj13, ajlen14, ajlen15 +# +# a[j]= aj0, aj1, aj8, aj9, aj4, aj5, aj12, aj13 +# a[j+len]= ajlen2, ajlen3, ajlen10, ajlen11, ajlen6, ajlen7, 
ajlen14, ajlen15 +# +.macro Load_L24Coeffs + lxv 32+25, 0(5) # a[j], r[j+len] + lxv 32+26, 16(5) # a[j], r[j+len] + vmrgew 8, 25, 26 + vmrgow 21, 25, 26 + lxv 32+25, 32(5) # a[j], r[j+len] + lxv 32+26, 48(5) # a[j], r[j+len] + vmrgew 12, 25, 26 + vmrgow 22, 25, 26 + lxv 32+25, 64(5) # a[j], r[j+len] + lxv 32+26, 80(5) # a[j], r[j+len] + vmrgew 16, 25, 26 + vmrgow 23, 25, 26 + lxv 32+25, 96(5) # a[j], r[j+len] + lxv 32+26, 112(5) # a[j], r[j+len] + vmrgew 20, 25, 26 + vmrgow 24, 25, 26 +.endm + +# +# Permute +# rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 +# rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 +# +# to +# rjlen4 - rjlen7, rjlen12 - rjlen15 +# rj0 - rj4, rj8 - rj11 +# +.macro Load_L44Coeffs + lxv 10, 0(5) # rj0, rj1, rj2, rj3, + # rjlen4, rjlen5, rjlen6, rjlen7 + lxv 11, 16(5) # rj8, rj9, rj10, rj11 + # rjlen12, rjlen13, rjlen14, rjlen15 + xxpermdi 32+8, 11, 10, 0 # rjlen4 - rjlen7, rjlen12 - rjlen15 + xxpermdi 32+21, 11, 10, 3 # rj0 - rj4, rj8 - rj11 + lxv 10, 32(5) + lxv 11, 48(5) + xxpermdi 32+12, 11, 10, 0 + xxpermdi 32+22, 11, 10, 3 + lxv 10, 64(5) + lxv 11, 80(5) + xxpermdi 32+16, 11, 10, 0 + xxpermdi 32+23, 11, 10, 3 + lxv 10, 96(5) + lxv 11, 112(5) + xxpermdi 32+20, 11, 10, 0 + xxpermdi 32+24, 11, 10, 3 .endm .macro BREDUCE_4X _v0 _v1 _v2 _v3 @@ -130,7 +191,7 @@ .endm #----------------------------------- -# MREDUCE_4X(len, start, _vz0, _vz1, _vz2, _vz3) +# MREDUCE_4X(_vz0, _vz1, _vz2, _vz3, _vo0, _vo1, _vo2, _vo3) # .macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 # Modular multification bond by 2^16 * q in abs value @@ -209,34 +270,88 @@ stxv \_vs7, -16(3) .endm -.macro Write_Len2_4C _vs0 _vs1 _vs2 _vs3 - xxmrglw 32+12, \_vs0, 10 - xxmrghw 32+11, \_vs0, 10 - xxpermdi 10, 32+12, 32+11, 3 - xxmrglw 32+16, \_vs1, 11 - xxmrghw 32+15, \_vs1, 11 - xxpermdi 11, 32+16, 32+15, 3 - xxmrglw 32+12, \_vs2, 12 - xxmrghw 32+11, \_vs2, 12 - xxpermdi 12, 32+12, 32+11, 3 - xxmrglw 32+16, \_vs3, 13 - xxmrghw 32+15, \_vs3, 13 - xxpermdi 13, 
32+16, 32+15, 3 - stxvd2x 10, 3, 9 - stxvd2x 11, 3, 16 - stxvd2x 12, 3, 18 - stxvd2x 13, 3, 20 +.macro PermWriteL44 + xxlor 32+14, 10, 10 + xxlor 32+19, 11, 11 + xxlor 32+24, 12, 12 + xxlor 32+29, 13, 13 + xxpermdi 32+10, 32+13, 32+14, 3 + xxpermdi 32+11, 32+13, 32+14, 0 + xxpermdi 32+12, 32+18, 32+19, 3 + xxpermdi 32+13, 32+18, 32+19, 0 + xxpermdi 32+14, 32+23, 32+24, 3 + xxpermdi 32+15, 32+23, 32+24, 0 + xxpermdi 32+16, 32+28, 32+29, 3 + xxpermdi 32+17, 32+28, 32+29, 0 + stxv 32+10, 0(5) + stxv 32+11, 16(5) + stxv 32+12, 32(5) + stxv 32+13, 48(5) + stxv 32+14, 64(5) + stxv 32+15, 80(5) + stxv 32+16, 96(5) + stxv 32+17, 112(5) +.endm + +.macro PermWriteL24 + xxlor 32+14, 10, 10 + xxlor 32+19, 11, 11 + xxlor 32+24, 12, 12 + xxlor 32+29, 13, 13 + vmrgew 10, 13, 14 + vmrgow 11, 13, 14 + vmrgew 12, 18, 19 + vmrgow 13, 18, 19 + vmrgew 14, 23, 24 + vmrgow 15, 23, 24 + vmrgew 16, 28, 29 + vmrgow 17, 28, 29 + stxv 32+10, 0(5) + stxv 32+11, 16(5) + stxv 32+12, 32(5) + stxv 32+13, 48(5) + stxv 32+14, 64(5) + stxv 32+15, 80(5) + stxv 32+16, 96(5) + stxv 32+17, 112(5) +.endm + +.macro INTT_REDUCE_L24 + Load_L24Coeffs + Compute_4Coeffs + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + PermWriteL24 +.endm + +.macro INTT_REDUCE_L44 + Load_L44Coeffs + Compute_4Coeffs + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + PermWriteL44 .endm -.macro Write_Len4_4C _vs0 _vs1 _vs2 _vs3 - xxpermdi 10, 10, \_vs0, 3 - xxpermdi 11, 11, \_vs1, 3 - xxpermdi 12, 12, \_vs2, 3 - xxpermdi 13, 13, \_vs3, 3 - stxvd2x 10, 3, 9 - stxvd2x 11, 3, 16 - stxvd2x 12, 3, 18 - stxvd2x 13, 3, 20 +.macro INTT_REDUCE_4X start next step + Load_4Coeffs \start, \next, \step + BREDUCE_4X 4, 9, 13, 
17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 .endm # intt @@ -300,164 +415,85 @@ MLK_ASM_FN_SYMBOL(intt_ppc) vslw 9, 9, 10 xxlor 7, 32+9, 32+9 # V_25 syore at vs7 + # + # Montgomery reduce loops with constant 1441 + # + addi 14, 4, C1441_OFFSET + lvx V1441, 0, 14 + li 15, 4 # loops + mtctr 15 + + Set_mont_consts +intt_ppc__Loopf: + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + bdnz intt_ppc__Loopf + + addi 3, 3, -512 + .align 4 # # 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 # Update zetas vectors, each vector has 2 zetas - addi 14, 4, IZETA_NTT_OFFSET127 + addi 14, 4, ZETA_INTT_OFFSET li 7, 4 li 15, 4 mtctr 15 - li 5, 0 + mr 5, 3 intt_ppc__Loop2: - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_Len2_4C 32+13, 32+18, 32+23, 32+28 - - addi 5, 5, 64 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_Len2_4C 32+13, 32+18, 32+23, 32+28 - addi 5, 5, 64 + INTT_REDUCE_L24 + addi 5, 5, 128 bdnz intt_ppc__Loop2 .align 4 # # 2. 
len = 4, start = 0, 8, 16, 24,...232, 240, 248 - addi 14, 4, IZETA_NTT_OFFSET63 - li 5, 0 + mr 5, 3 li 7, 8 li 15, 4 # loops mtctr 15 intt_ppc__Loop4: - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_Len4_4C 32+13, 32+18, 32+23, 32+28 - addi 5, 5, 64 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_Len4_4C 32+13, 32+18, 32+23, 32+28 - addi 5, 5, 64 + INTT_REDUCE_L44 + addi 5, 5, 128 bdnz intt_ppc__Loop4 .align 4 # 3. len = 8, start = 0, 16, 32, 48,...208, 224, 240 - #addi 14, 14, 512 li 7, 16 li 5, 0 + li 15, 4 # loops + mtctr 15 - Load_4Coeffs 5, 32, 32 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 128 - - Load_4Coeffs 5, 32, 32 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 256 - - Load_4Coeffs 5, 32, 32 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 384 - - Load_4Coeffs 5, 32, 32 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 +intt_ppc__Loop8: + INTT_REDUCE_4X 5, 32, 32 + addi 5, 5, 128 + bdnz intt_ppc__Loop8 .align 4 # # 4. 
len = 16, start = 0, 32, 64,,...160, 192, 224 - #addi 14, 14, 768 li 5, 0 li 7, 32 - Load_4Coeffs 5, 64, 64 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + INTT_REDUCE_4X 5, 64, 64 + li 5, 16 - Load_4Coeffs 5, 64, 64 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts addi 14, 14, -64 - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + INTT_REDUCE_4X 5, 64, 64 li 5, 256 - Load_4Coeffs 5, 64, 64 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + INTT_REDUCE_4X 5, 64, 64 li 5, 272 - Load_4Coeffs 5, 64, 64 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts addi 14, 14, -64 - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + INTT_REDUCE_4X 5, 64, 64 .align 4 # # 5. len = 32, start = 0, 64, 128, 192 - #addi 14, 14, 896 li 5, 0 li 7, 64 @@ -503,7 +539,6 @@ intt_ppc__Loop4: .align 4 # # 6. len = 64, start = 0, 128 - #addi 14, 14, 960 li 5, 0 li 7, 128 Load_4Coeffs 5, 16, 16 @@ -546,7 +581,6 @@ intt_ppc__Loop4: .align 4 # 7. 
len = 128, start = 0 # - #addi 14, 14, 992 li 5, 0 # start li 7, 256 # len * 2 @@ -586,37 +620,6 @@ intt_ppc__Loop4: MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 Write_M4C 32+13, 32+18, 32+23, 32+28 -.align 4 - # - # Montgomery reduce loops with constant 1441 - # - addi 14, 4, C1441_OFFSET - lvx V1441, 0, 14 - - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 - MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 - - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 - MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 - - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 - MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 - - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 - MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 - lxv 32+20, 128(1) lxv 32+21, 144(1) lxv 32+22, 160(1) @@ -660,18 +663,3 @@ intt_ppc__Loop4: #endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ !MLK_CONFIG_MULTILEVEL_NO_SHARED */ - -/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. - * Don't modify by hand -- this is auto-generated by scripts/autogen. 
*/ -#undef V20159 -#undef V_25 -#undef V_26 -#undef V_MKQ -#undef V_QINV -#undef V_NMKQ -#undef V_Z0 -#undef V_Z1 -#undef V_Z2 -#undef V_Z3 -#undef V_ZETA -#undef V1441 diff --git a/mlkem/src/native/ppc64le/src/ntt_ppc.S b/mlkem/src/native/ppc64le/src/ntt_ppc.S index 83f42f9b8..70e7bf710 100644 --- a/mlkem/src/native/ppc64le/src/ntt_ppc.S +++ b/mlkem/src/native/ppc64le/src/ntt_ppc.S @@ -27,15 +27,7 @@ .machine "any" .text -# -# montgomery_reduce -# t = a * QINV -# t = (a - (int32_t)t*_MLKEM_Q) >> 16 -# -#----------------------------------- -# MREDUCE_4X(start, _vz0, _vz1, _vz2, _vz3) -# -.macro MREDUCE_4X start next step _vz0 _vz1 _vz2 _vz3 +.macro Load_4Coeffs start next step mr 9, \start add 10, 7, 9 # J + len*2 addi 16, 9, \next @@ -52,7 +44,74 @@ xxpermdi 32+18, 32+18, 32+18, 2 xxpermdi 32+23, 32+23, 32+23, 2 xxpermdi 32+28, 32+28, 32+28, 2 +.endm + +# +# Load Coefficients and setup vectors +# aj0, aj1, ajlen2, ajlen3, aj4, aj5, ajlen6, ajlen7 +# aj8, aj9, ajlen10, ajlen11, aj12, aj13, ajlen14, ajlen15 +# +# a[j]= aj0, aj1, aj8, aj9, aj4, aj5, aj12, aj13 +# a[j+len]= ajlen2, ajlen3, ajlen10, ajlen11, ajlen6, ajlen7, ajlen14, ajlen15 +# +.macro Load_L24Coeffs + lxv 32+25, 0(5) # a[j], r[j+len] + lxv 32+26, 16(5) # a[j], r[j+len] + vmrgew 13, 25, 26 + vmrgow 12, 25, 26 + lxv 32+25, 32(5) # a[j], r[j+len] + lxv 32+26, 48(5) # a[j], r[j+len] + vmrgew 18, 25, 26 + vmrgow 17, 25, 26 + lxv 32+25, 64(5) # a[j], r[j+len] + lxv 32+26, 80(5) # a[j], r[j+len] + vmrgew 23, 25, 26 + vmrgow 22, 25, 26 + lxv 32+25, 96(5) # a[j], r[j+len] + lxv 32+26, 112(5) # a[j], r[j+len] + vmrgew 28, 25, 26 + vmrgow 27, 25, 26 +.endm +# +# Permute +# rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 +# rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 +# +# to +# rjlen4 - rjlen7, rjlen12 - rjlen15 +# rj0 - rj3, rj8 - rj11 +# +.macro Load_L44Coeffs + lxv 1, 0(5) # rj0, rj1, rj2, rj3, + # rjlen4, rjlen5, rjlen6, rjlen7 + lxv 2, 16(5) # rj8, rj9, rj10, rj11 + # rjlen12, rjlen13, rjlen14, 
rjlen15 + xxpermdi 32+13, 2, 1, 0 # rjlen4 - rjlen7, rjlen12 - rjlen15 + xxpermdi 32+12, 2, 1, 3 # rj0 - rj3, rj8 - rj11 + lxv 3, 32(5) + lxv 4, 48(5) + xxpermdi 32+18, 4, 3, 0 + xxpermdi 32+17, 4, 3, 3 + lxv 1, 64(5) + lxv 2, 80(5) + xxpermdi 32+23, 2, 1, 0 + xxpermdi 32+22, 2, 1, 3 + lxv 3, 96(5) + lxv 4, 112(5) + xxpermdi 32+28, 4, 3, 0 + xxpermdi 32+27, 4, 3, 3 +.endm + +# +# montgomery_reduce +# t = a * QINV +# t = (a - (int32_t)t*_MLKEM_Q) >> 16 +# +#----------------------------------- +# MREDUCE_4X(_vz0, _vz1, _vz2, _vz3) +# +.macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 # fqmul = zeta * coefficient # Modular multification bond by 2^16 * q in abs value vmladduhm 15, 13, \_vz0, 3 @@ -81,6 +140,9 @@ vsrah 23, 25, 4 # >> 1 vsrah 28, 30, 4 # >> 1 +.endm + +.macro Load_4Aj lxvd2x 32+12, 3, 9 # r[j] lxvd2x 32+17, 3, 16 # r[j] lxvd2x 32+22, 3, 18 # r[j] @@ -89,7 +151,9 @@ xxpermdi 32+17, 32+17, 32+17, 2 xxpermdi 32+22, 32+22, 32+22, 2 xxpermdi 32+27, 32+27, 32+27, 2 +.endm +.macro Compute_4Coeffs # Since the result of the Montgomery multiplication is bounded # by q in absolute value. 
# Finally to complete the final update of the results with add/sub @@ -103,6 +167,13 @@ vadduhm 30, 28, 27 # r + t .endm +.macro NTT_MREDUCE_4X start next step _vz0 _vz1 _vz2 _vz3 + Load_4Coeffs \start, \next, \step + MREDUCE_4X \_vz0, \_vz1, \_vz2, \_vz3 + Load_4Aj + Compute_4Coeffs +.endm + .macro Write_One stxvx 32+15, 3, 9 stxvx 32+16, 3, 10 @@ -114,35 +185,44 @@ stxvx 32+31, 3, 21 .endm -.macro Write_Two - xxpermdi 32+17, 32+16, 32+15, 3 - xxpermdi 32+22, 32+21, 32+20, 3 - xxpermdi 32+27, 32+26, 32+25, 3 - xxpermdi 32+29, 32+31, 32+30, 3 - - stxvx 32+17, 3, 9 - stxvx 32+22, 3, 16 - stxvx 32+27, 3, 18 - stxvx 32+29, 3, 20 +.macro PermWriteL44 + Compute_4Coeffs + xxpermdi 0, 32+16, 32+15, 3 + xxpermdi 1, 32+16, 32+15, 0 + xxpermdi 2, 32+21, 32+20, 3 + xxpermdi 3, 32+21, 32+20, 0 + xxpermdi 4, 32+26, 32+25, 3 + xxpermdi 5, 32+26, 32+25, 0 + xxpermdi 6, 32+31, 32+30, 3 + xxpermdi 7, 32+31, 32+30, 0 + stxv 0, 0(5) + stxv 1, 16(5) + stxv 2, 32(5) + stxv 3, 48(5) + stxv 4, 64(5) + stxv 5, 80(5) + stxv 6, 96(5) + stxv 7, 112(5) .endm -.macro Write_Three - xxmrglw 32+14, 32+16, 32+15 - xxmrghw 32+13, 32+16, 32+15 - xxpermdi 32+17, 32+13, 32+14, 3 - xxmrglw 32+19, 32+21, 32+20 - xxmrghw 32+18, 32+21, 32+20 - xxpermdi 32+22, 32+18, 32+19, 3 - xxmrglw 32+14, 32+26, 32+25 - xxmrghw 32+13, 32+26, 32+25 - xxpermdi 32+27, 32+13, 32+14, 3 - xxmrglw 32+24, 32+31, 32+30 - xxmrghw 32+23, 32+31, 32+30 - xxpermdi 32+29, 32+23, 32+24, 3 - stxvx 32+17, 3, 9 - stxvx 32+22, 3, 16 - stxvx 32+27, 3, 18 - stxvx 32+29, 3, 20 +.macro PermWriteL24 + Compute_4Coeffs + vmrgew 10, 16, 15 + vmrgow 11, 16, 15 + vmrgew 12, 21, 20 + vmrgow 13, 21, 20 + vmrgew 14, 26, 25 + vmrgow 15, 26, 25 + vmrgew 16, 31, 30 + vmrgow 17, 31, 30 + stxv 32+10, 0(5) + stxv 32+11, 16(5) + stxv 32+12, 32(5) + stxv 32+13, 48(5) + stxv 32+14, 64(5) + stxv 32+15, 80(5) + stxv 32+16, 96(5) + stxv 32+17, 112(5) .endm .macro Load_next_4zetas @@ -206,16 +286,16 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) lvx V_ZETA, 0, 14 addi 14, 14, 16 - 
MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One li 5, 64 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One li 5, 128 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One li 5, 192 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One .align 4 @@ -226,19 +306,19 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) li 7, 128 lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One li 5, 64 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One li 5, 256 lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One li 5, 320 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One .align 4 @@ -249,28 +329,25 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) li 7, 64 lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One - #li 5, 64 li 5, 128 lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One - #li 5, 128 li 5, 256 lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One - #li 5, 192 li 5, 384 lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One .align 4 @@ -280,18 +357,18 @@ 
MLK_ASM_FN_SYMBOL(ntt_ppc) li 5, 0 li 7, 32 Load_next_4zetas - MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 Write_One li 5, 16 - MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 Write_One Load_next_4zetas li 5, 256 - MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 Write_One li 5, 272 - MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 Write_One .align 4 @@ -301,22 +378,22 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) li 5, 0 li 7, 16 Load_next_4zetas - MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 Write_One li 5, 128 Load_next_4zetas - MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 Write_One li 5, 256 Load_next_4zetas - MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 Write_One li 5, 384 Load_next_4zetas - MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 Write_One # @@ -324,19 +401,15 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) # k += 32 li 15, 4 # loops mtctr 15 - li 5, 0 + mr 5, 3 li 7, 8 .align 4 ntt_ppc__Len4: Load_next_4zetas - MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 - Write_Two - addi 5, 5, 64 - - Load_next_4zetas - MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 - Write_Two - addi 5, 5, 64 + Load_L44Coeffs + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 + PermWriteL44 + addi 5, 5, 128 bdnz ntt_ppc__Len4 @@ -345,23 +418,17 @@ ntt_ppc__Len4: # k += 64 # Update zetas vectors, each vector has 2 zetas - addi 14, 4, ZETA_NTT_OFFSET64 - li 15, 4 mtctr 15 - li 5, 0 + mr 5, 3 li 7, 4 .align 4 ntt_ppc__Len2: Load_next_4zetas - MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 - Write_Three - addi 5, 5, 64 - - Load_next_4zetas - MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 - Write_Three - addi 5, 5, 64 + Load_L24Coeffs + MREDUCE_4X 
V_Z0, V_Z1, V_Z2, V_Z3 + PermWriteL24 + addi 5, 5, 128 bdnz ntt_ppc__Len2 @@ -399,9 +466,3 @@ ntt_ppc__Len2: #endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ !MLK_CONFIG_MULTILEVEL_NO_SHARED */ - -/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. - * Don't modify by hand -- this is auto-generated by scripts/autogen. */ -#undef V_QINV -#undef V_NMKQ -#undef V_ZETA