@@ -389,9 +389,9 @@ static vint16m2_t mlk_rv64v_intt2(vint16m2_t vp, vint16m1_t cz)
   t0 = __riscv_vget_v_i16m2_i16m1(vp, 0);
   t1 = __riscv_vget_v_i16m2_i16m1(vp, 1);

-  /* move to positive range [0, q-1] for the reverse transform */
-  t0 = fq_mulq_vx(t0, MLK_RVV_MONT_R1, vl);
-  t1 = fq_mulq_vx(t1, MLK_RVV_MONT_R1, vl);
+  /* pre-scale and move to positive range [0, q-1] for the inverse transform */
+  t0 = fq_mulq_vx(t0, MLK_RVV_MONT_NR, vl);
+  t1 = fq_mulq_vx(t1, MLK_RVV_MONT_NR, vl);

   c0 = __riscv_vrgather_vv_i16m1(cz, cs2, vl);
   MLK_RVV_BFLY_RV(t0, t1, vt, c0, vl);
@@ -512,23 +512,6 @@ void mlk_rv64v_poly_invntt_tomont(int16_t *r)
   MLK_RVV_BFLY_RX(v6, ve, vt, izeta[0x01], vl);
   MLK_RVV_BFLY_RX(v7, vf, vt, izeta[0x01], vl);

-  v0 = fq_mulq_vx(v0, MLK_RVV_MONT_NR, vl);
-  v1 = fq_mulq_vx(v1, MLK_RVV_MONT_NR, vl);
-  v2 = fq_mulq_vx(v2, MLK_RVV_MONT_NR, vl);
-  v3 = fq_mulq_vx(v3, MLK_RVV_MONT_NR, vl);
-  v4 = fq_mulq_vx(v4, MLK_RVV_MONT_NR, vl);
-  v5 = fq_mulq_vx(v5, MLK_RVV_MONT_NR, vl);
-  v6 = fq_mulq_vx(v6, MLK_RVV_MONT_NR, vl);
-  v7 = fq_mulq_vx(v7, MLK_RVV_MONT_NR, vl);
-  v8 = fq_mulq_vx(v8, MLK_RVV_MONT_NR, vl);
-  v9 = fq_mulq_vx(v9, MLK_RVV_MONT_NR, vl);
-  va = fq_mulq_vx(va, MLK_RVV_MONT_NR, vl);
-  vb = fq_mulq_vx(vb, MLK_RVV_MONT_NR, vl);
-  vc = fq_mulq_vx(vc, MLK_RVV_MONT_NR, vl);
-  vd = fq_mulq_vx(vd, MLK_RVV_MONT_NR, vl);
-  ve = fq_mulq_vx(ve, MLK_RVV_MONT_NR, vl);
-  vf = fq_mulq_vx(vf, MLK_RVV_MONT_NR, vl);
-
   __riscv_vse16_v_i16m1(&r[0x00], v0, vl);
   __riscv_vse16_v_i16m1(&r[0x10], v1, vl);
   __riscv_vse16_v_i16m1(&r[0x20], v2, vl);
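Why the sixteen trailing multiplications can simply be dropped: the inverse NTT is a linear transform, so multiplying every input coefficient by a constant gives the same result as multiplying every output by that constant. The range-normalization factor and the final scaling factor (MLK_RVV_MONT_NR, presumably combining the n^-1 and Montgomery factors) can therefore be merged into the single constant applied in the pre-scale step of mlk_rv64v_intt2. The scalar sketch below illustrates the equivalence on one Gentleman-Sande butterfly; all names in it (Q, fq_mulq, butterfly, the constants) are illustrative stand-ins, not the identifiers or exact constants used in this backend.

/* Hedged scalar sketch: scaling inputs vs. scaling outputs of a linear
 * butterfly gives identical results. Names and constants are illustrative. */
#include <assert.h>
#include <stdint.h>

#define Q 3329

/* Plain modular multiply, fully reduced to [0, Q-1] (inputs in [0, Q-1]). */
static int16_t fq_mulq(int16_t a, int16_t b)
{
  return (int16_t)(((int32_t)a * b) % Q);
}

/* One Gentleman-Sande (inverse-NTT) butterfly on coefficients in [0, Q-1]. */
static void butterfly(int16_t *x, int16_t *y, int16_t zeta)
{
  int16_t t = *x;
  *x = (int16_t)((t + *y) % Q);
  *y = fq_mulq((int16_t)((t - *y + Q) % Q), zeta);
}

int main(void)
{
  const int16_t c = 512;   /* stand-in for the combined scaling constant */
  const int16_t zeta = 17; /* arbitrary twiddle factor */
  const int16_t a0 = 1234, a1 = 2100;

  /* Old shape: butterfly first, scale each output afterwards. */
  int16_t x0 = a0, x1 = a1;
  butterfly(&x0, &x1, zeta);
  x0 = fq_mulq(x0, c);
  x1 = fq_mulq(x1, c);

  /* New shape: scale the inputs once, then run the butterfly. */
  int16_t y0 = fq_mulq(a0, c), y1 = fq_mulq(a1, c);
  butterfly(&y0, &y1, zeta);

  /* Linearity of the transform makes the two orderings agree. */
  assert(x0 == y0 && x1 == y1);
  return 0;
}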