Skip to content

Commit 83f1c8b

Browse files
committed
HOL-Light: Add proof for x86_64 poly_tobytes
This commit adds the HOL-Light correctness proof of the x86_64 polynomial compression by @dkostic. - Ports awslabs/s2n-bignum#317 - Resolves #1382 Signed-off-by: Matthias J. Kannwischer <[email protected]>
1 parent e1b2dbc commit 83f1c8b

File tree

8 files changed

+624
-1
lines changed

8 files changed

+624
-1
lines changed

.github/workflows/hol_light.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,8 @@ jobs:
199199
needs: ["mlkem_specs.ml"]
200200
- name: mlkem_reduce
201201
needs: ["mlkem_specs.ml"]
202+
- name: mlkem_tobytes
203+
needs: ["mlkem_specs.ml"]
202204
name: x86_64 HOL Light proof for ${{ matrix.proof.name }}.S
203205
runs-on: pqcp-x64
204206
if: github.repository_owner == 'pq-code-package' && !github.event.pull_request.head.repo.fork

BIBLIOGRAPHY.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,7 @@ source code and documentation.
264264
- [proofs/hol_light/x86/mlkem/mlkem_intt.S](proofs/hol_light/x86/mlkem/mlkem_intt.S)
265265
- [proofs/hol_light/x86/mlkem/mlkem_ntt.S](proofs/hol_light/x86/mlkem/mlkem_ntt.S)
266266
- [proofs/hol_light/x86/mlkem/mlkem_reduce.S](proofs/hol_light/x86/mlkem/mlkem_reduce.S)
267+
- [proofs/hol_light/x86/mlkem/mlkem_tobytes.S](proofs/hol_light/x86/mlkem/mlkem_tobytes.S)
267268

268269
### `SLOTHY`
269270

proofs/hol_light/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ The following x86_64 assembly routines used in mlkem-native are covered:
105105
* x86_64 inverse NTT: [mlkem_intt.S](x86/mlkem/mlkem_intt.S)
106106
* x86_64 base multiplications: [mlkem_poly_basemul_acc_montgomery_cached_k2.S](x86/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k2.S) [mlkem_poly_basemul_acc_montgomery_cached_k3.S](x86/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k3.S) [mlkem_poly_basemul_acc_montgomery_cached_k4.S](x86/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k4.S)
107107
* x86_64 modular reduction: [mlkem_reduce.S](x86/mlkem/mlkem_reduce.S)
108+
* x86_64 polynomial compression: [mlkem_tobytes.S](arm/mlkem/mlkem_tobytes.S)
108109

109110
<!--- bibliography --->
110111
[^HYBRID]: Becker, Kannwischer: Hybrid scalar/vector implementations of Keccak and SPHINCS+ on AArch64, [https://eprint.iacr.org/2022/1243](https://eprint.iacr.org/2022/1243)

proofs/hol_light/x86/Makefile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,8 @@ OBJ = mlkem/mlkem_poly_basemul_acc_montgomery_cached_k2.o \
6565
mlkem/mlkem_poly_basemul_acc_montgomery_cached_k4.o \
6666
mlkem/mlkem_ntt.o \
6767
mlkem/mlkem_intt.o \
68-
mlkem/mlkem_reduce.o
68+
mlkem/mlkem_reduce.o \
69+
mlkem/mlkem_tobytes.o
6970

7071
# Build object files from assembly sources
7172
$(OBJ): %.o : %.S
Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
/*
2+
* Copyright (c) The mlkem-native project authors
3+
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
4+
*/
5+
6+
/* References
7+
* ==========
8+
*
9+
* - [REF_AVX2]
10+
* CRYSTALS-Kyber optimized AVX2 implementation
11+
* Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
12+
* https://github.com/pq-crystals/kyber/tree/main/avx2
13+
*/
14+
15+
/*
16+
* This file is derived from the public domain
17+
* AVX2 Kyber implementation @[REF_AVX2].
18+
*/
19+
20+
21+
22+
/*
23+
* WARNING: This file is auto-derived from the mlkem-native source file
24+
* dev/x86_64/src/ntttobytes.S using scripts/simpasm. Do not modify it directly.
25+
*/
26+
27+
#if defined(__ELF__)
28+
.section .note.GNU-stack,"",@progbits
29+
#endif
30+
31+
.text
32+
.balign 4
33+
#ifdef __APPLE__
34+
.global _PQCP_MLKEM_NATIVE_MLKEM768_ntttobytes_avx2
35+
_PQCP_MLKEM_NATIVE_MLKEM768_ntttobytes_avx2:
36+
#else
37+
.global PQCP_MLKEM_NATIVE_MLKEM768_ntttobytes_avx2
38+
PQCP_MLKEM_NATIVE_MLKEM768_ntttobytes_avx2:
39+
#endif
40+
41+
.cfi_startproc
42+
endbr64
43+
movl $0xd010d01, %eax # imm = 0xD010D01
44+
vmovd %eax, %xmm0
45+
vpbroadcastd %xmm0, %ymm0
46+
vmovdqa (%rsi), %ymm5
47+
vmovdqa 0x20(%rsi), %ymm6
48+
vmovdqa 0x40(%rsi), %ymm7
49+
vmovdqa 0x60(%rsi), %ymm8
50+
vmovdqa 0x80(%rsi), %ymm9
51+
vmovdqa 0xa0(%rsi), %ymm10
52+
vmovdqa 0xc0(%rsi), %ymm11
53+
vmovdqa 0xe0(%rsi), %ymm12
54+
vpsllw $0xc, %ymm6, %ymm4
55+
vpor %ymm4, %ymm5, %ymm4
56+
vpsrlw $0x4, %ymm6, %ymm5
57+
vpsllw $0x8, %ymm7, %ymm6
58+
vpor %ymm5, %ymm6, %ymm5
59+
vpsrlw $0x8, %ymm7, %ymm6
60+
vpsllw $0x4, %ymm8, %ymm7
61+
vpor %ymm6, %ymm7, %ymm6
62+
vpsllw $0xc, %ymm10, %ymm7
63+
vpor %ymm7, %ymm9, %ymm7
64+
vpsrlw $0x4, %ymm10, %ymm8
65+
vpsllw $0x8, %ymm11, %ymm9
66+
vpor %ymm8, %ymm9, %ymm8
67+
vpsrlw $0x8, %ymm11, %ymm9
68+
vpsllw $0x4, %ymm12, %ymm10
69+
vpor %ymm9, %ymm10, %ymm9
70+
vpslld $0x10, %ymm5, %ymm3
71+
vpblendw $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7],ymm4[8],ymm3[9],ymm4[10],ymm3[11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
72+
vpsrld $0x10, %ymm4, %ymm4
73+
vpblendw $0xaa, %ymm5, %ymm4, %ymm5 # ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7],ymm4[8],ymm5[9],ymm4[10],ymm5[11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
74+
vpslld $0x10, %ymm7, %ymm4
75+
vpblendw $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7],ymm6[8],ymm4[9],ymm6[10],ymm4[11],ymm6[12],ymm4[13],ymm6[14],ymm4[15]
76+
vpsrld $0x10, %ymm6, %ymm6
77+
vpblendw $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7],ymm6[8],ymm7[9],ymm6[10],ymm7[11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
78+
vpslld $0x10, %ymm9, %ymm6
79+
vpblendw $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7],ymm8[8],ymm6[9],ymm8[10],ymm6[11],ymm8[12],ymm6[13],ymm8[14],ymm6[15]
80+
vpsrld $0x10, %ymm8, %ymm8
81+
vpblendw $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7],ymm8[8],ymm9[9],ymm8[10],ymm9[11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
82+
vmovsldup %ymm4, %ymm8 # ymm8 = ymm4[0,0,2,2,4,4,6,6]
83+
vpblendd $0xaa, %ymm8, %ymm3, %ymm8 # ymm8 = ymm3[0],ymm8[1],ymm3[2],ymm8[3],ymm3[4],ymm8[5],ymm3[6],ymm8[7]
84+
vpsrlq $0x20, %ymm3, %ymm3
85+
vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
86+
vmovsldup %ymm5, %ymm3 # ymm3 = ymm5[0,0,2,2,4,4,6,6]
87+
vpblendd $0xaa, %ymm3, %ymm6, %ymm3 # ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4],ymm3[5],ymm6[6],ymm3[7]
88+
vpsrlq $0x20, %ymm6, %ymm6
89+
vpblendd $0xaa, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7]
90+
vmovsldup %ymm9, %ymm6 # ymm6 = ymm9[0,0,2,2,4,4,6,6]
91+
vpblendd $0xaa, %ymm6, %ymm7, %ymm6 # ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7]
92+
vpsrlq $0x20, %ymm7, %ymm7
93+
vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
94+
vpunpcklqdq %ymm3, %ymm8, %ymm7 # ymm7 = ymm8[0],ymm3[0],ymm8[2],ymm3[2]
95+
vpunpckhqdq %ymm3, %ymm8, %ymm3 # ymm3 = ymm8[1],ymm3[1],ymm8[3],ymm3[3]
96+
vpunpcklqdq %ymm4, %ymm6, %ymm8 # ymm8 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
97+
vpunpckhqdq %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
98+
vpunpcklqdq %ymm9, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm9[0],ymm5[2],ymm9[2]
99+
vpunpckhqdq %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[1],ymm9[1],ymm5[3],ymm9[3]
100+
vperm2i128 $0x20, %ymm8, %ymm7, %ymm5 # ymm5 = ymm7[0,1],ymm8[0,1]
101+
vperm2i128 $0x31, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[2,3],ymm8[2,3]
102+
vperm2i128 $0x20, %ymm3, %ymm6, %ymm7 # ymm7 = ymm6[0,1],ymm3[0,1]
103+
vperm2i128 $0x31, %ymm3, %ymm6, %ymm3 # ymm3 = ymm6[2,3],ymm3[2,3]
104+
vperm2i128 $0x20, %ymm9, %ymm4, %ymm6 # ymm6 = ymm4[0,1],ymm9[0,1]
105+
vperm2i128 $0x31, %ymm9, %ymm4, %ymm9 # ymm9 = ymm4[2,3],ymm9[2,3]
106+
vmovdqu %ymm5, (%rdi)
107+
vmovdqu %ymm7, 0x20(%rdi)
108+
vmovdqu %ymm6, 0x40(%rdi)
109+
vmovdqu %ymm8, 0x60(%rdi)
110+
vmovdqu %ymm3, 0x80(%rdi)
111+
vmovdqu %ymm9, 0xa0(%rdi)
112+
vmovdqa 0x100(%rsi), %ymm5
113+
vmovdqa 0x120(%rsi), %ymm6
114+
vmovdqa 0x140(%rsi), %ymm7
115+
vmovdqa 0x160(%rsi), %ymm8
116+
vmovdqa 0x180(%rsi), %ymm9
117+
vmovdqa 0x1a0(%rsi), %ymm10
118+
vmovdqa 0x1c0(%rsi), %ymm11
119+
vmovdqa 0x1e0(%rsi), %ymm12
120+
vpsllw $0xc, %ymm6, %ymm4
121+
vpor %ymm4, %ymm5, %ymm4
122+
vpsrlw $0x4, %ymm6, %ymm5
123+
vpsllw $0x8, %ymm7, %ymm6
124+
vpor %ymm5, %ymm6, %ymm5
125+
vpsrlw $0x8, %ymm7, %ymm6
126+
vpsllw $0x4, %ymm8, %ymm7
127+
vpor %ymm6, %ymm7, %ymm6
128+
vpsllw $0xc, %ymm10, %ymm7
129+
vpor %ymm7, %ymm9, %ymm7
130+
vpsrlw $0x4, %ymm10, %ymm8
131+
vpsllw $0x8, %ymm11, %ymm9
132+
vpor %ymm8, %ymm9, %ymm8
133+
vpsrlw $0x8, %ymm11, %ymm9
134+
vpsllw $0x4, %ymm12, %ymm10
135+
vpor %ymm9, %ymm10, %ymm9
136+
vpslld $0x10, %ymm5, %ymm3
137+
vpblendw $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7],ymm4[8],ymm3[9],ymm4[10],ymm3[11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
138+
vpsrld $0x10, %ymm4, %ymm4
139+
vpblendw $0xaa, %ymm5, %ymm4, %ymm5 # ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7],ymm4[8],ymm5[9],ymm4[10],ymm5[11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
140+
vpslld $0x10, %ymm7, %ymm4
141+
vpblendw $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7],ymm6[8],ymm4[9],ymm6[10],ymm4[11],ymm6[12],ymm4[13],ymm6[14],ymm4[15]
142+
vpsrld $0x10, %ymm6, %ymm6
143+
vpblendw $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7],ymm6[8],ymm7[9],ymm6[10],ymm7[11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
144+
vpslld $0x10, %ymm9, %ymm6
145+
vpblendw $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7],ymm8[8],ymm6[9],ymm8[10],ymm6[11],ymm8[12],ymm6[13],ymm8[14],ymm6[15]
146+
vpsrld $0x10, %ymm8, %ymm8
147+
vpblendw $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7],ymm8[8],ymm9[9],ymm8[10],ymm9[11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
148+
vmovsldup %ymm4, %ymm8 # ymm8 = ymm4[0,0,2,2,4,4,6,6]
149+
vpblendd $0xaa, %ymm8, %ymm3, %ymm8 # ymm8 = ymm3[0],ymm8[1],ymm3[2],ymm8[3],ymm3[4],ymm8[5],ymm3[6],ymm8[7]
150+
vpsrlq $0x20, %ymm3, %ymm3
151+
vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
152+
vmovsldup %ymm5, %ymm3 # ymm3 = ymm5[0,0,2,2,4,4,6,6]
153+
vpblendd $0xaa, %ymm3, %ymm6, %ymm3 # ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4],ymm3[5],ymm6[6],ymm3[7]
154+
vpsrlq $0x20, %ymm6, %ymm6
155+
vpblendd $0xaa, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7]
156+
vmovsldup %ymm9, %ymm6 # ymm6 = ymm9[0,0,2,2,4,4,6,6]
157+
vpblendd $0xaa, %ymm6, %ymm7, %ymm6 # ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7]
158+
vpsrlq $0x20, %ymm7, %ymm7
159+
vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
160+
vpunpcklqdq %ymm3, %ymm8, %ymm7 # ymm7 = ymm8[0],ymm3[0],ymm8[2],ymm3[2]
161+
vpunpckhqdq %ymm3, %ymm8, %ymm3 # ymm3 = ymm8[1],ymm3[1],ymm8[3],ymm3[3]
162+
vpunpcklqdq %ymm4, %ymm6, %ymm8 # ymm8 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
163+
vpunpckhqdq %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
164+
vpunpcklqdq %ymm9, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm9[0],ymm5[2],ymm9[2]
165+
vpunpckhqdq %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[1],ymm9[1],ymm5[3],ymm9[3]
166+
vperm2i128 $0x20, %ymm8, %ymm7, %ymm5 # ymm5 = ymm7[0,1],ymm8[0,1]
167+
vperm2i128 $0x31, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[2,3],ymm8[2,3]
168+
vperm2i128 $0x20, %ymm3, %ymm6, %ymm7 # ymm7 = ymm6[0,1],ymm3[0,1]
169+
vperm2i128 $0x31, %ymm3, %ymm6, %ymm3 # ymm3 = ymm6[2,3],ymm3[2,3]
170+
vperm2i128 $0x20, %ymm9, %ymm4, %ymm6 # ymm6 = ymm4[0,1],ymm9[0,1]
171+
vperm2i128 $0x31, %ymm9, %ymm4, %ymm9 # ymm9 = ymm4[2,3],ymm9[2,3]
172+
vmovdqu %ymm5, 0xc0(%rdi)
173+
vmovdqu %ymm7, 0xe0(%rdi)
174+
vmovdqu %ymm6, 0x100(%rdi)
175+
vmovdqu %ymm8, 0x120(%rdi)
176+
vmovdqu %ymm3, 0x140(%rdi)
177+
vmovdqu %ymm9, 0x160(%rdi)
178+
retq
179+
.cfi_endproc

proofs/hol_light/x86/proofs/dump_bytecode.ml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,3 +28,7 @@ print_string "==== bytecode end =====================================\n\n";;
2828
print_string "=== bytecode start: x86/mlkem/mlkem_reduce.o ===\n";;
2929
print_literal_from_elf "x86/mlkem/mlkem_reduce.o";;
3030
print_string "==== bytecode end =====================================\n\n";;
31+
32+
print_string "=== bytecode start: x86/mlkem/mlkem_tobytes.o ===\n";;
33+
print_literal_from_elf "x86/mlkem/mlkem_tobytes.o";;
34+
print_string "==== bytecode end =====================================\n\n";;

0 commit comments

Comments
 (0)