AcademySoftwareFoundation · kpchoi · Jun 9, 2025 · Jun 4, 2025 · Jun 4, 2025 · Jun 4, 2025
diff --git a/src/avx/oapv_tq_avx.c b/src/avx/oapv_tq_avx.c
@@ -95,10 +95,127 @@ static void oapv_tx_part_avx(s16 *src, s16 *dst, int shift, int line)
     _mm_store_si128((__m128i *)(dst + 6 * line), _mm256_castsi256_si128(d1));
     _mm_store_si128((__m128i *)(dst + 7 * line), _mm256_extracti128_si256(d1, 1));
 }
+/* AVX table for the single-pass transform kernel; the second slot is NULL. */
+const oapv_fn_tx_part_t oapv_tbl_fn_txb_part_avx[2] =
+{
+    oapv_tx_part_avx,
+        NULL
+};
+
+/*
+ * Two-pass transform of one block, performed in place on `src`.
+ *
+ * Pass 1 loads 64 s16 values from src[0..63] and runs them through
+ * CALCU_2x8 / CALCU_2x8_ADD_SHIFT with shift1 and the rounding offset
+ * 1 << (shift1 - 1). Pass 2 repeats the sequence on the pass-1 output
+ * with shift2 and stores eight rows of 8 values to src + k * line
+ * (k = 0..7).
+ *
+ * src    : input block, overwritten with the result. Results are written
+ *          with aligned 128-bit stores, so every src + k * line must be
+ *          16-byte aligned.
+ * shift1 : shift for pass 1; must be >= 1 (offset is 1 << (shift1 - 1)).
+ * shift2 : shift for pass 2; must be >= 1 (offset is 1 << (shift2 - 1)).
+ * line   : distance, in s16 elements, between consecutive output rows.
+ *
+ * NOTE(review): the loads use fixed offsets (row stride 8) while the stores
+ * use `line`; the two agree only when line == 8 -- confirm with callers.
+ * NOTE(review): CALCU_2x8 and CALCU_2x8_ADD_SHIFT are defined outside this
+ * hunk; presumably they read s0..s3 and use v0..v7 as scratch -- confirm.
+ */
+static void oapv_tx_avx(s16 *src, int shift1, int shift2, int line)
+{
+    __m256i v0, v1, v2, v3, v4, v5, v6, v7;
+    __m256i d0, d1, d2, d3, d4, d5;
+    /* Coefficient vectors, 16 x s16 each; _mm256_set_epi16 lists lanes from highest to lowest. */
+    __m256i coeff[8];
+    coeff[0] = _mm256_set1_epi16(64);
+    coeff[1] = _mm256_set_epi16(64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64);
+    coeff[2] = _mm256_set_epi16(84, 35, -35, -84, -84, -35, 35, 84, 84, 35, -35, -84, -84, -35, 35, 84);
+    coeff[3] = _mm256_set_epi16(35, -84, 84, -35, -35, 84, -84, 35, 35, -84, 84, -35, -35, 84, -84, 35);
+    coeff[4] = _mm256_set_epi16(-89, -75, -50, -18, 18, 50, 75, 89, -89, -75, -50, -18, 18, 50, 75, 89);
+    coeff[5] = _mm256_set_epi16(-75, 18, 89, 50, -50, -89, -18, 75, -75, 18, 89, 50, -50, -89, -18, 75);
+    coeff[6] = _mm256_set_epi16(-50, 89, -18, -75, 75, 18, -89, 50, -50, 89, -18, -75, 75, 18, -89, 50);
+    coeff[7] = _mm256_set_epi16(-18, 50, -75, 89, -89, 75, -50, 18, -18, 50, -75, 89, -89, 75, -50, 18);
+    /* Rounding offsets handed to CALCU_2x8_ADD_SHIFT for pass 1 and pass 2. */
+    __m256i add1 = _mm256_set1_epi32(1 << (shift1 - 1));
+    __m256i add2 = _mm256_set1_epi32(1 << (shift2 - 1));
+
+    __m256i s0, s1, s2, s3;
+
+    /* s<r> holds src[8r .. 8r+7] in its low half and src[8r+32 .. 8r+39] in its high half. */
+    s0 = _mm256_loadu2_m128i((const __m128i *)&src[32], (const __m128i *)&src[0]);
+    s1 = _mm256_loadu2_m128i((const __m128i *)&src[40], (const __m128i *)&src[8]);
+    s2 = _mm256_loadu2_m128i((const __m128i *)&src[48], (const __m128i *)&src[16]);
+    s3 = _mm256_loadu2_m128i((const __m128i *)&src[56], (const __m128i *)&src[24]);
+
+    /* Pass 1, first group of coefficient vectors (0, 4, 2, 5). */
+    CALCU_2x8(coeff[0], coeff[4], d0, d1);
+    CALCU_2x8(coeff[2], coeff[5], d2, d3);
+    CALCU_2x8_ADD_SHIFT(d0, d1, d2, d3, add1, shift1);
+
+    /* Narrow the 32-bit results to s16 with signed saturation. */
+    d0 = _mm256_packs_epi32(d0, d1);
+    d1 = _mm256_packs_epi32(d2, d3);
+
+    /* packs works per 128-bit lane (a.lo, b.lo, a.hi, b.hi); 0xd8 reorders the 64-bit lanes to a.lo, a.hi, b.lo, b.hi. */
+    d0 = _mm256_permute4x64_epi64(d0, 0xd8);
+    d1 = _mm256_permute4x64_epi64(d1, 0xd8);
+
+    /* Pass 1, second group of coefficient vectors (1, 6, 3, 7). */
+    CALCU_2x8(coeff[1], coeff[6], d2, d3);
+    CALCU_2x8(coeff[3], coeff[7], d4, d5);
+    CALCU_2x8_ADD_SHIFT(d2, d3, d4, d5, add1, shift1);
+
+    d2 = _mm256_packs_epi32(d2, d3);
+    d3 = _mm256_packs_epi32(d4, d5);
+
+    d2 = _mm256_permute4x64_epi64(d2, 0xd8);
+    d3 = _mm256_permute4x64_epi64(d3, 0xd8);
+
+
+    /* Regroup the pass-1 output into s0..s3 as the input of pass 2. */
+    s0 = _mm256_setr_m128i(_mm256_castsi256_si128(d0), _mm256_castsi256_si128(d2));
+    s1 = _mm256_setr_m128i(_mm256_extracti128_si256(d0, 1), _mm256_extracti128_si256(d2, 1));
+    s2 = _mm256_setr_m128i(_mm256_castsi256_si128(d1), _mm256_castsi256_si128(d3));
+    s3 = _mm256_setr_m128i(_mm256_extracti128_si256(d1, 1), _mm256_extracti128_si256(d3, 1));
+
+    /* Pass 2, output rows 0..3. */
+    CALCU_2x8(coeff[0], coeff[4], d0, d1);
+    CALCU_2x8(coeff[2], coeff[5], d2, d3);
+    CALCU_2x8_ADD_SHIFT(d0, d1, d2, d3, add2, shift2);
+
+    d0 = _mm256_packs_epi32(d0, d1);
+    d1 = _mm256_packs_epi32(d2, d3);
+
+    d0 = _mm256_permute4x64_epi64(d0, 0xd8);
+    d1 = _mm256_permute4x64_epi64(d1, 0xd8);
+
+    _mm_store_si128((__m128i *)src, _mm256_castsi256_si128(d0));
+    _mm_store_si128((__m128i *)(src + 1 * line), _mm256_extracti128_si256(d0, 1));
+    _mm_store_si128((__m128i *)(src + 2 * line), _mm256_castsi256_si128(d1));
+    _mm_store_si128((__m128i *)(src + 3 * line), _mm256_extracti128_si256(d1, 1));
+
+    /* Pass 2, output rows 4..7. */
+    CALCU_2x8(coeff[1], coeff[6], d0, d1);
+    CALCU_2x8(coeff[3], coeff[7], d2, d3);
+    CALCU_2x8_ADD_SHIFT(d0, d1, d2, d3, add2, shift2);
+
+    d0 = _mm256_packs_epi32(d0, d1);
+    d1 = _mm256_packs_epi32(d2, d3);
+
+    d0 = _mm256_permute4x64_epi64(d0, 0xd8);
+    d1 = _mm256_permute4x64_epi64(d1, 0xd8);
+
+    _mm_store_si128((__m128i *)(src + 4 * line), _mm256_castsi256_si128(d0));
+    _mm_store_si128((__m128i *)(src + 5 * line), _mm256_extracti128_si256(d0, 1));
+    _mm_store_si128((__m128i *)(src + 6 * line), _mm256_castsi256_si128(d1));
+    _mm_store_si128((__m128i *)(src + 7 * line), _mm256_extracti128_si256(d1, 1));
+}
 
 const oapv_fn_tx_t oapv_tbl_fn_txb_avx[2] =
 {
-    oapv_tx_part_avx,
+    oapv_tx_avx,
         NULL
 };
 

diff --git a/src/avx/oapv_tq_avx.h b/src/avx/oapv_tq_avx.h
@@ -129,6 +129,7 @@
 
 
 #if X86_SSE
+extern const oapv_fn_tx_part_t oapv_tbl_fn_txb_part_avx[2]; /* AVX table of oapv_fn_tx_part_t (single-shift transform) entries */
 extern const oapv_fn_tx_t oapv_tbl_fn_txb_avx[2];
 extern const oapv_fn_quant_t oapv_tbl_fn_quant_avx[2];
 extern const oapv_fn_itx_part_t oapv_tbl_fn_itx_part_avx[2];

diff --git a/src/neon/oapv_tq_neon.c b/src/neon/oapv_tq_neon.c
@@ -52,7 +52,7 @@ const s32 oapv_coeff[8][4] =
     high = vmulq_s32(part2, coeff); \
     res = vcombine_s32(vpadd_s32(vget_low_s32(low), vget_high_s32(low)), vpadd_s32(vget_low_s32(high), vget_high_s32(high))); \
 
-static void oapv_tx_pb8b_neon(s16 *src, s16 *dst, const int shift, int line)
+static void oapv_tx_pb8b_part_neon(s16 *src, s16 *dst, const int shift, int line)
 {
     s16 i;
     s16 *tempSrc = src;
@@ -186,6 +186,19 @@ static void oapv_tx_pb8b_neon(s16 *src, s16 *dst, const int shift, int line)
     }
 }
 
+/*
+ * Two-pass transform wrapper, in place on `src`: oapv_tx_pb8b_part_neon runs
+ * from src into a local buffer with shift1, then from that buffer back into
+ * src with shift2. The same `line` value is forwarded to both passes.
+ */
+static void oapv_tx_pb8b_neon(s16 *src, const int shift1, const int shift2, int line)
+{
+    ALIGNED_16(s16 tmp[OAPV_BLK_D]); /* holds the first-pass output */
+
+    oapv_tx_pb8b_part_neon(src, tmp, shift1, line);
+    oapv_tx_pb8b_part_neon(tmp, src, shift2, line);
+}
+
 const oapv_fn_tx_t oapv_tbl_fn_txb_neon[2] =
     {
         oapv_tx_pb8b_neon,

diff --git a/src/oapv.c b/src/oapv.c
@@ -1209,6 +1209,7 @@ static int enc_platform_init(oapve_ctx_t *ctx)
     ctx->fn_itx = oapv_tbl_fn_itx;
     ctx->fn_itx_adj = oapv_tbl_fn_itx_adj;
     ctx->fn_txb = oapv_tbl_fn_tx;
+    ctx->fn_txb_part = oapv_tbl_fn_tx_part;
     ctx->fn_quant = oapv_tbl_fn_quant;
     ctx->fn_dquant = oapv_tbl_fn_dquant;
     ctx->fn_had8x8 = oapv_dc_removed_had8x8;
@@ -1227,6 +1228,7 @@ static int enc_platform_init(oapve_ctx_t *ctx)
         ctx->fn_itx = oapv_tbl_fn_itx_avx;
         ctx->fn_itx_adj = oapv_tbl_fn_itx_adj_avx;
         ctx->fn_txb = oapv_tbl_fn_txb_avx;
+        ctx->fn_txb_part = oapv_tbl_fn_txb_part_avx;
         ctx->fn_quant = oapv_tbl_fn_quant_avx;
         ctx->fn_dquant = oapv_tbl_fn_dquant_avx;
         ctx->fn_had8x8 = oapv_dc_removed_had8x8_sse;

diff --git a/src/oapv_def.h b/src/oapv_def.h
@@ -174,7 +174,8 @@ typedef struct oapve_core oapve_core_t;
  *****************************************************************************/
 typedef void (*oapv_fn_itx_part_t)(s16 *coef, s16 *t, int shift, int line);
 typedef void (*oapv_fn_itx_t)(s16 *coef, int shift1, int shift2, int line);
-typedef void (*oapv_fn_tx_t)(s16 *coef, s16 *t, int shift, int line);
+typedef void (*oapv_fn_tx_part_t)(s16 *coef, s16 *t, int shift, int line); /* single transform pass: two buffers (coef, t) and one shift */
+typedef void (*oapv_fn_tx_t)(s16 *coef, int shift1, int shift2, int line); /* one buffer (coef) and two shifts, no separate buffer argument */
 typedef void (*oapv_fn_itx_adj_t)(int *src, int *dst, int itrans_diff_idx, int diff_step, int shift);
 typedef int (*oapv_fn_quant_t)(s16 *coef, u8 qp, int q_matrix[OAPV_BLK_D], int log2_w, int log2_h, int bit_depth, int deadzone_offset);
 typedef void (*oapv_fn_dquant_t)(s16 *coef, s16 q_matrix[OAPV_BLK_D], int log2_w, int log2_h, s8 shift);
@@ -295,6 +296,7 @@ struct oapve_ctx {
     const oapv_fn_itx_t      *fn_itx;
     const oapv_fn_itx_adj_t  *fn_itx_adj;
     const oapv_fn_tx_t       *fn_txb;
+    const oapv_fn_tx_part_t  *fn_txb_part;
     const oapv_fn_quant_t    *fn_quant;
     const oapv_fn_dquant_t   *fn_dquant;
     const oapv_fn_sad_t      *fn_sad;

diff --git a/src/oapv_tq.c b/src/oapv_tq.c
@@ -70,8 +70,21 @@ static void oapv_tx_part(s16 *src, s16 *dst, int shift, int line)
     }
 }
 
-const oapv_fn_tx_t oapv_tbl_fn_tx[2] = {
+const oapv_fn_tx_part_t oapv_tbl_fn_tx_part[2] = {
     oapv_tx_part,
+    NULL
+};
+
+/* Two-pass transform, in place: src -> tmp with shift1, then tmp -> src with shift2. */
+static void oapv_tx(s16 *src, int shift1, int shift2, int line)
+{
+    ALIGNED_16(s16 tmp[OAPV_BLK_D]); /* holds the first-pass output */
+    oapv_tx_part(src, tmp, shift1, line); /* both passes receive the same `line` */
+    oapv_tx_part(tmp, src, shift2, line);
+}
+
+const oapv_fn_tx_t oapv_tbl_fn_tx[2] = {
+    oapv_tx,
     NULL
 };
 
@@ -90,9 +103,7 @@ void oapv_trans(oapve_ctx_t *ctx, s16 *coef, int log2_w, int log2_h, int bit_dep
     int shift1 = get_transform_shift(log2_w, 0, bit_depth);
     int shift2 = get_transform_shift(log2_h, 1, bit_depth);
 
-    ALIGNED_16(s16 tb[OAPV_BLK_D]);
-    (ctx->fn_txb)[0](coef, tb, shift1, 1 << log2_h);
-    (ctx->fn_txb)[0](tb, coef, shift2, 1 << log2_w);
+    (ctx->fn_txb)[0](coef, shift1, shift2, 1 << log2_h);
 }
 
 static int oapv_quant(s16 *coef, u8 qp, int q_matrix[OAPV_BLK_D], int log2_w, int log2_h, int bit_depth, int deadzone_offset)

diff --git a/src/oapv_tq.h b/src/oapv_tq.h
@@ -39,9 +39,10 @@
 #if ENABLE_ENCODER
 ///////////////////////////////////////////////////////////////////////////////
 
-extern const oapv_fn_tx_t    oapv_tbl_fn_tx[2];
-extern const oapv_fn_quant_t oapv_tbl_fn_quant[2];
-extern const int             oapv_quant_scale[6];
+extern const oapv_fn_tx_t       oapv_tbl_fn_tx[2];      /* table of oapv_fn_tx_t (two-shift transform) entries */
+extern const oapv_fn_tx_part_t  oapv_tbl_fn_tx_part[2]; /* table of oapv_fn_tx_part_t (single-shift transform) entries */
+extern const oapv_fn_quant_t    oapv_tbl_fn_quant[2];
+extern const int                oapv_quant_scale[6];
 
 void oapv_trans(oapve_ctx_t *ctx, s16 *coef, int log2_w, int log2_h, int bit_depth);
 void oapv_itx_get_wo_sft(s16 *src, s16 *dst, s32 *dst32, int shift, int line);