@@ -349,6 +349,7 @@ void mlk_rv64v_poly_ntt(int16_t *r)
     u0 = __riscv_vadd_vv_i16m1(u0, u1, vl); \
     u0 = fq_csub(u0, vl);                   \
     u1 = fq_mul_vx(ut, uc, vl);             \
+    u1 = fq_cadd(u1, vl);                   \
   }

 #define MLK_RVV_BFLY_RV(u0, u1, ut, uc, vl) \
@@ -357,6 +358,7 @@ void mlk_rv64v_poly_ntt(int16_t *r)
     u0 = __riscv_vadd_vv_i16m1(u0, u1, vl); \
     u0 = fq_csub(u0, vl);                   \
     u1 = fq_mul_vv(ut, uc, vl);             \
+    u1 = fq_cadd(u1, vl);                   \
   }

 static vint16m2_t mlk_rv64v_intt2(vint16m2_t vp, vint16m1_t cz)
@@ -387,9 +389,10 @@ static vint16m2_t mlk_rv64v_intt2(vint16m2_t vp, vint16m1_t cz)
   t0 = __riscv_vget_v_i16m2_i16m1(vp, 0);
   t1 = __riscv_vget_v_i16m2_i16m1(vp, 1);

-  /* initial reduction due to lack of input assumptions on INTT */
-  t0 = fq_mul_vx(t0, MLK_RVV_MONT_R1, vl);
-  t1 = fq_mul_vx(t1, MLK_RVV_MONT_R1, vl);
+  /* move to positive range [0, q-1] for the reverse transform */
+  t0 = fq_mulq_vx(t0, MLK_RVV_MONT_R1, vl);
+  t1 = fq_mulq_vx(t1, MLK_RVV_MONT_R1, vl);
+
   c0 = __riscv_vrgather_vv_i16m1(cz, cs2, vl);
   MLK_RVV_BFLY_RV(t0, t1, vt, c0, vl);

@@ -415,9 +418,6 @@ static vint16m2_t mlk_rv64v_intt2(vint16m2_t vp, vint16m1_t cz)
   t0 = __riscv_vget_v_i16m2_i16m1(vp, 0);
   t1 = __riscv_vget_v_i16m2_i16m1(vp, 1);

-  /* normalize first element */
-  t0 = fq_mulq_vx(t0, MLK_RVV_MONT_R1, vl);
-
   vp = __riscv_vcreate_v_i16m1_i16m2(t0, t1);

   return vp;
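
Note on the new fq_cadd() calls: the helper itself is not shown in this diff, but from the comment "move to positive range [0, q-1]" it appears to be a lane-wise conditional add of q that folds negative residues back into the canonical range. A minimal sketch of such a helper, assuming the standard RVV intrinsics API and MLKEM_Q = 3329; the name fq_cadd_sketch and its signature are hypothetical and may differ from the repository's actual fq_cadd():

#include <riscv_vector.h>

#define MLKEM_Q 3329

/* Sketch (assumption, not the repo's code): add q to every lane that is
 * negative and leave non-negative lanes unchanged, so that inputs in
 * [-(q-1), q-1] end up in [0, q-1]. */
static inline vint16m1_t fq_cadd_sketch(vint16m1_t v, size_t vl)
{
  vbool16_t neg = __riscv_vmslt_vx_i16m1_b16(v, 0, vl); /* lanes with v < 0 */
  return __riscv_vadd_vx_i16m1_mu(neg, v, v, MLKEM_Q, vl); /* masked add of q */
}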