-
Notifications
You must be signed in to change notification settings - Fork 1.3k
Description
Hello,
I'm using this function from PQClean:
void
PQCLEAN_FALCON512_CLEAN_prng_init(prng *p, inner_shake256_context *src) {
    /*
     * Fill the first 56 bytes of the PRNG state from the SHAKE256
     * context 'src', then call the refill routine on 'p'.
     *
     * The extracted bytes are decoded explicitly as little-endian
     * words, so that a given seed always yields the same state
     * word values, whatever the byte order of the host.
     *
     * NOTE(review): the state buffer is accessed through both a
     * uint32_t and a uint64_t pointer; confirm against the
     * declaration of 'state.d' that this is acceptable with respect
     * to alignment and aliasing.
     */
    uint8_t seed[56];
    uint32_t *words = (uint32_t *) p->state.d;
    uint64_t *dwords = (uint64_t *) p->state.d;
    uint64_t lo, hi;
    int k;

    inner_shake256_extract(src, seed, 56);

    /* Fourteen 32-bit words, each built from four consecutive bytes. */
    for (k = 0; k < 14; k ++) {
        const uint8_t *b = seed + (k << 2);

        words[k] = (uint32_t)b[0]
                   | ((uint32_t)b[1] << 8)
                   | ((uint32_t)b[2] << 16)
                   | ((uint32_t)b[3] << 24);
    }

    /*
     * The last two words (byte offsets 48 and 52) are read back and
     * merged into a single 64-bit value stored at byte offset 48.
     */
    lo = words[12];
    hi = words[13];
    dwords[6] = lo + (hi << 32);

    PQCLEAN_FALCON512_CLEAN_prng_refill(p);
}
When I run my code on my x86_64 PC (with both the O2 and O0 flags), everything works fine.
However, when I build with the O2 flag and run the code in the gem5 simulator, I get a segmentation fault.
First, I noticed that with the O2 flag the for (i = 0; i < 14; i ++) loop is auto-vectorized.
Then I used remote gdb with gem5 to debug my code, and discovered that the segmentation fault is caused by this instruction:
22768: 02b70187 vlm.v v3,(a4)
This a4 comes from:
2275c: 8ec18713 addi a4,gp,-1812 # 390ec <__TMC_END__+0x4>
Additionally, I tried another program with the same vlm.v instruction and the same vtype and vl configuration in gem5 (but with a different memory address), and no segmentation fault occurred (I did this only to check whether it is an internal gem5 error).
What do you think about this issue? If you need any more information from my setup/code, feel free to ask me (I can share anything from this code).
O0 disassembly code from this function:
000000000002f430 <PQCLEAN_FALCON512_CLEAN_prng_init>:
2f430: 7175 addi sp,sp,-144
2f432: e506 sd ra,136(sp)
2f434: e122 sd s0,128(sp)
2f436: 0900 addi s0,sp,144
2f438: f6a43c23 sd a0,-136(s0)
2f43c: f6b43823 sd a1,-144(s0)
2f440: f7843783 ld a5,-136(s0)
2f444: 20878793 addi a5,a5,520
2f448: fef43023 sd a5,-32(s0)
2f44c: f7843783 ld a5,-136(s0)
2f450: 20878793 addi a5,a5,520
2f454: fcf43c23 sd a5,-40(s0)
2f458: f8840793 addi a5,s0,-120
2f45c: f7043603 ld a2,-144(s0)
2f460: 03800593 li a1,56
2f464: 853e mv a0,a5
2f466: 963e40ef jal 13dc8 <shake256_inc_squeeze>
2f46a: fe042623 sw zero,-20(s0)
2f46e: a045 j 2f50e <PQCLEAN_FALCON512_CLEAN_prng_init+0xde>
2f470: fec42783 lw a5,-20(s0)
2f474: 0027979b slliw a5,a5,0x2
2f478: 2781 sext.w a5,a5
2f47a: 17c1 addi a5,a5,-16
2f47c: 97a2 add a5,a5,s0
2f47e: f987c783 lbu a5,-104(a5)
2f482: 0007871b sext.w a4,a5
2f486: fec42783 lw a5,-20(s0)
2f48a: 0027979b slliw a5,a5,0x2
2f48e: 2781 sext.w a5,a5
2f490: 2785 addiw a5,a5,1
2f492: 2781 sext.w a5,a5
2f494: 17c1 addi a5,a5,-16
2f496: 97a2 add a5,a5,s0
2f498: f987c783 lbu a5,-104(a5)
2f49c: 2781 sext.w a5,a5
2f49e: 0087979b slliw a5,a5,0x8
2f4a2: 2781 sext.w a5,a5
2f4a4: 8fd9 or a5,a5,a4
2f4a6: 0007871b sext.w a4,a5
2f4aa: fec42783 lw a5,-20(s0)
2f4ae: 0027979b slliw a5,a5,0x2
2f4b2: 2781 sext.w a5,a5
2f4b4: 2789 addiw a5,a5,2
2f4b6: 2781 sext.w a5,a5
2f4b8: 17c1 addi a5,a5,-16
2f4ba: 97a2 add a5,a5,s0
2f4bc: f987c783 lbu a5,-104(a5)
2f4c0: 2781 sext.w a5,a5
2f4c2: 0107979b slliw a5,a5,0x10
2f4c6: 2781 sext.w a5,a5
2f4c8: 8fd9 or a5,a5,a4
2f4ca: 0007871b sext.w a4,a5
2f4ce: fec42783 lw a5,-20(s0)
2f4d2: 0027979b slliw a5,a5,0x2
2f4d6: 2781 sext.w a5,a5
2f4d8: 278d addiw a5,a5,3
2f4da: 2781 sext.w a5,a5
2f4dc: 17c1 addi a5,a5,-16
2f4de: 97a2 add a5,a5,s0
2f4e0: f987c783 lbu a5,-104(a5)
2f4e4: 2781 sext.w a5,a5
2f4e6: 0187979b slliw a5,a5,0x18
2f4ea: 2781 sext.w a5,a5
2f4ec: 8fd9 or a5,a5,a4
2f4ee: fcf42223 sw a5,-60(s0)
2f4f2: fec42783 lw a5,-20(s0)
2f4f6: 078a slli a5,a5,0x2
2f4f8: fe043703 ld a4,-32(s0)
2f4fc: 97ba add a5,a5,a4
2f4fe: fc442703 lw a4,-60(s0)
2f502: c398 sw a4,0(a5)
2f504: fec42783 lw a5,-20(s0)
2f508: 2785 addiw a5,a5,1
2f50a: fef42623 sw a5,-20(s0)
2f50e: fec42783 lw a5,-20(s0)
2f512: 0007871b sext.w a4,a5
2f516: 47b5 li a5,13
2f518: f4e7dce3 bge a5,a4,2f470 <PQCLEAN_FALCON512_CLEAN_prng_init+0x40>
2f51c: fe043783 ld a5,-32(s0)
2f520: 03078793 addi a5,a5,48
2f524: 439c lw a5,0(a5)
2f526: 1782 slli a5,a5,0x20
2f528: 9381 srli a5,a5,0x20
2f52a: fcf43823 sd a5,-48(s0)
2f52e: fe043783 ld a5,-32(s0)
2f532: 03478793 addi a5,a5,52
2f536: 439c lw a5,0(a5)
2f538: 1782 slli a5,a5,0x20
2f53a: 9381 srli a5,a5,0x20
2f53c: fcf43423 sd a5,-56(s0)
2f540: fc843783 ld a5,-56(s0)
2f544: 02079693 slli a3,a5,0x20
2f548: fd843783 ld a5,-40(s0)
2f54c: 03078793 addi a5,a5,48
2f550: fd043703 ld a4,-48(s0)
2f554: 9736 add a4,a4,a3
2f556: e398 sd a4,0(a5)
2f558: f7843503 ld a0,-136(s0)
2f55c: 010000ef jal 2f56c <PQCLEAN_FALCON512_CLEAN_prng_refill>
2f560: 0001 nop
2f562: 0001 nop
2f564: 60aa ld ra,136(sp)
2f566: 640a ld s0,128(sp)
2f568: 6149 addi sp,sp,144
2f56a: 8082 ret
O2 disassembly code from this function:
0000000000022748 <PQCLEAN_FALCON512_CLEAN_prng_init>:
22748: 715d addi sp,sp,-80
2274a: e0a2 sd s0,64(sp)
2274c: 862e mv a2,a1
2274e: 842a mv s0,a0
22750: 03800593 li a1,56
22754: 0028 addi a0,sp,8
22756: e486 sd ra,72(sp)
22758: 9c3ef0ef jal 1211a <shake256_inc_squeeze>
2275c: 8ec18713 addi a4,gp,-1812 # 390ec <__TMC_END__+0x4>
22760: 8ed18793 addi a5,gp,-1811 # 390ed <__TMC_END__+0x5>
22764: cc627057 vsetivli zero,4,e8,mf4,ta,ma
22768: 02b70187 vlm.v v3,(a4)
2276c: 02b78107 vlm.v v2,(a5)
22770: 20840713 addi a4,s0,520
22774: 003c addi a5,sp,8
22776: 23840513 addi a0,s0,568
2277a: 00478593 addi a1,a5,4
2277e: 02058087 vle8.v v1,(a1)
22782: 9e303057 vmv1r.v v0,v3
22786: 02078207 vle8.v v4,(a5)
2278a: 00878613 addi a2,a5,8
2278e: 02060407 vle8.v v8,(a2)
22792: 00c78693 addi a3,a5,12
22796: 5e1025d7 vcompress.vm v11,v1,v0
2279a: 9e203057 vmv1r.v v0,v2
2279e: 02068387 vle8.v v7,(a3)
227a2: 07c1 addi a5,a5,16
227a4: 5e102557 vcompress.vm v10,v1,v0
227a8: 9e303057 vmv1r.v v0,v3
227ac: 5e402357 vcompress.vm v6,v4,v0
227b0: 9e203057 vmv1r.v v0,v2
227b4: 3ab13357 vslideup.vi v6,v11,2
227b8: 5e4020d7 vcompress.vm v1,v4,v0
227bc: 5e802257 vcompress.vm v4,v8,v0
227c0: 9e303057 vmv1r.v v0,v3
227c4: 3aa130d7 vslideup.vi v1,v10,2
227c8: 5e8022d7 vcompress.vm v5,v8,v0
227cc: 9e203057 vmv1r.v v0,v2
227d0: 5e7024d7 vcompress.vm v9,v7,v0
227d4: 9e303057 vmv1r.v v0,v3
227d8: 3a913257 vslideup.vi v4,v9,2
227dc: 5e702457 vcompress.vm v8,v7,v0
227e0: 5e1024d7 vcompress.vm v9,v1,v0
227e4: 5e4023d7 vcompress.vm v7,v4,v0
227e8: 3a8132d7 vslideup.vi v5,v8,2
227ec: 5e602457 vcompress.vm v8,v6,v0
227f0: 3a7134d7 vslideup.vi v9,v7,2
227f4: 5e502557 vcompress.vm v10,v5,v0
227f8: 9e203057 vmv1r.v v0,v2
227fc: 3aa13457 vslideup.vi v8,v10,2
22800: 5e6023d7 vcompress.vm v7,v6,v0
22804: 5e502557 vcompress.vm v10,v5,v0
22808: 5e102357 vcompress.vm v6,v1,v0
2280c: 0d007057 vsetvli zero,zero,e32,m1,ta,ma
22810: 4a9220d7 vzext.vf4 v1,v9
22814: 0c607057 vsetvli zero,zero,e8,mf4,ta,ma
22818: 5e4024d7 vcompress.vm v9,v4,v0
2281c: 0d007057 vsetvli zero,zero,e32,m1,ta,ma
22820: 4a8222d7 vzext.vf4 v5,v8
22824: 0c607057 vsetvli zero,zero,e8,mf4,ta,ma
22828: 3aa133d7 vslideup.vi v7,v10,2
2282c: 0d007057 vsetvli zero,zero,e32,m1,ta,ma
22830: 961830d7 vsll.vi v1,v1,16
22834: 0c607057 vsetvli zero,zero,e8,mf4,ta,ma
22838: 3a913357 vslideup.vi v6,v9,2
2283c: 0d007057 vsetvli zero,zero,e32,m1,ta,ma
22840: 965c32d7 vsll.vi v5,v5,24
22844: 0cf07057 vsetvli zero,zero,e16,mf2,ta,ma
22848: 4a732257 vzext.vf2 v4,v7
2284c: 0d007057 vsetvli zero,zero,e32,m1,ta,ma
22850: 2a1280d7 vor.vv v1,v1,v5
22854: 4a6223d7 vzext.vf4 v7,v6
22858: 0cf07057 vsetvli zero,zero,e16,mf2,ta,ma
2285c: 96443257 vsll.vi v4,v4,8
22860: 0d007057 vsetvli zero,zero,e32,m1,ta,ma
22864: 2a1380d7 vor.vv v1,v1,v7
22868: 4a4322d7 vzext.vf2 v5,v4
2286c: 2a1280d7 vor.vv v1,v1,v5
22870: 020760a7 vse32.v v1,(a4)
22874: 0741 addi a4,a4,16
22876: 00e50563 beq a0,a4,22880 <PQCLEAN_FALCON512_CLEAN_prng_init+0x138>
2287a: 0c607057 vsetvli zero,zero,e8,mf4,ta,ma
2287e: bdf5 j 2277a <PQCLEAN_FALCON512_CLEAN_prng_init+0x32>
22880: 03914603 lbu a2,57(sp)
22884: 03a14783 lbu a5,58(sp)
22888: 03d14683 lbu a3,61(sp)
2288c: 03e14583 lbu a1,62(sp)
22890: 03814803 lbu a6,56(sp)
22894: 03b14703 lbu a4,59(sp)
22898: 0107979b slliw a5,a5,0x10
2289c: 0086161b slliw a2,a2,0x8
228a0: 03c14503 lbu a0,60(sp)
228a4: 8e5d or a2,a2,a5
228a6: 03f14783 lbu a5,63(sp)
228aa: 0105959b slliw a1,a1,0x10
228ae: 0086969b slliw a3,a3,0x8
228b2: 01066633 or a2,a2,a6
228b6: 8ecd or a3,a3,a1
228b8: 0187171b slliw a4,a4,0x18
228bc: 8ec9 or a3,a3,a0
228be: 8f51 or a4,a4,a2
228c0: 0187979b slliw a5,a5,0x18
228c4: 8fd5 or a5,a5,a3
228c6: 1702 slli a4,a4,0x20
228c8: 9301 srli a4,a4,0x20
228ca: 1782 slli a5,a5,0x20
228cc: 8fd9 or a5,a5,a4
228ce: 22f43c23 sd a5,568(s0)
228d2: 8522 mv a0,s0
228d4: 969ff0ef jal 2223c <PQCLEAN_FALCON512_CLEAN_prng_refill>
228d8: 60a6 ld ra,72(sp)
228da: 6406 ld s0,64(sp)
228dc: 6161 addi sp,sp,80
228de: 8082 ret