Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 41 additions & 15 deletions src/avx/oapv_tq_avx.c
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,10 @@
_mm256_set_m128i(_mm_loadu_si128(hiaddr), _mm_loadu_si128(loaddr))
#endif // !_mm256_loadu2_m128i

static void oapv_tx_part_avx(s16 *src, s16 *dst, int shift, int line)
static void oapv_tx_avx(s16 *src, int shift1, int shift2, int line)
{
__m256i v0, v1, v2, v3, v4, v5, v6, v7;
__m256i d0, d1, d2, d3;
__m256i d0, d1, d2, d3, d4, d5;
__m256i coeff[8];
coeff[0] = _mm256_set1_epi16(64);
coeff[1] = _mm256_set_epi16(64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64);
Expand All @@ -56,7 +56,8 @@ static void oapv_tx_part_avx(s16 *src, s16 *dst, int shift, int line)
coeff[5] = _mm256_set_epi16(-75, 18, 89, 50, -50, -89, -18, 75, -75, 18, 89, 50, -50, -89, -18, 75);
coeff[6] = _mm256_set_epi16(-50, 89, -18, -75, 75, 18, -89, 50, -50, 89, -18, -75, 75, 18, -89, 50);
coeff[7] = _mm256_set_epi16(-18, 50, -75, 89, -89, 75, -50, 18, -18, 50, -75, 89, -89, 75, -50, 18);
__m256i add = _mm256_set1_epi32(1 << (shift - 1));
__m256i add1 = _mm256_set1_epi32(1 << (shift1 - 1));
__m256i add2 = _mm256_set1_epi32(1 << (shift2 - 1));

__m256i s0, s1, s2, s3;

Expand All @@ -67,38 +68,63 @@ static void oapv_tx_part_avx(s16 *src, s16 *dst, int shift, int line)

CALCU_2x8(coeff[0], coeff[4], d0, d1);
CALCU_2x8(coeff[2], coeff[5], d2, d3);
CALCU_2x8_ADD_SHIFT(d0, d1, d2, d3, add, shift)
CALCU_2x8_ADD_SHIFT(d0, d1, d2, d3, add1, shift1);

d0 = _mm256_packs_epi32(d0, d1);
d0 = _mm256_packs_epi32(d0, d1);
d1 = _mm256_packs_epi32(d2, d3);

d0 = _mm256_permute4x64_epi64(d0, 0xd8);
d1 = _mm256_permute4x64_epi64(d1, 0xd8);

CALCU_2x8(coeff[1], coeff[6], d2, d3);
CALCU_2x8(coeff[3], coeff[7], d4, d5);
CALCU_2x8_ADD_SHIFT(d2, d3, d4, d5, add1, shift1);

d2 = _mm256_packs_epi32(d2, d3);
d3 = _mm256_packs_epi32(d4, d5);

d2 = _mm256_permute4x64_epi64(d2, 0xd8);
d3 = _mm256_permute4x64_epi64(d3, 0xd8);

s0 = _mm256_setr_m128i(_mm256_castsi256_si128(d0), _mm256_castsi256_si128(d2));
s1 = _mm256_setr_m128i(_mm256_extracti128_si256(d0, 1), _mm256_extracti128_si256(d2, 1));
s2 = _mm256_setr_m128i(_mm256_castsi256_si128(d1), _mm256_castsi256_si128(d3));
s3 = _mm256_setr_m128i(_mm256_extracti128_si256(d1, 1), _mm256_extracti128_si256(d3, 1));

CALCU_2x8(coeff[0], coeff[4], d0, d1);
CALCU_2x8(coeff[2], coeff[5], d2, d3);
CALCU_2x8_ADD_SHIFT(d0, d1, d2, d3, add2, shift2)

d0 = _mm256_packs_epi32(d0, d1);
d1 = _mm256_packs_epi32(d2, d3);

d0 = _mm256_permute4x64_epi64(d0, 0xd8);
d1 = _mm256_permute4x64_epi64(d1, 0xd8);

_mm_store_si128((__m128i *)dst, _mm256_castsi256_si128(d0));
_mm_store_si128((__m128i *)(dst + 1 * line), _mm256_extracti128_si256(d0, 1));
_mm_store_si128((__m128i *)(dst + 2 * line), _mm256_castsi256_si128(d1));
_mm_store_si128((__m128i *)(dst + 3 * line), _mm256_extracti128_si256(d1, 1));
_mm_store_si128((__m128i *)src, _mm256_castsi256_si128(d0));
_mm_store_si128((__m128i *)(src + 1 * line), _mm256_extracti128_si256(d0, 1));
_mm_store_si128((__m128i *)(src + 2 * line), _mm256_castsi256_si128(d1));
_mm_store_si128((__m128i *)(src + 3 * line), _mm256_extracti128_si256(d1, 1));

CALCU_2x8(coeff[1], coeff[6], d0, d1);
CALCU_2x8(coeff[3], coeff[7], d2, d3);
CALCU_2x8_ADD_SHIFT(d0, d1, d2, d3, add, shift);
CALCU_2x8_ADD_SHIFT(d0, d1, d2, d3, add2, shift2);

d0 = _mm256_packs_epi32(d0, d1);
d1 = _mm256_packs_epi32(d2, d3);

d0 = _mm256_permute4x64_epi64(d0, 0xd8);
d1 = _mm256_permute4x64_epi64(d1, 0xd8);

_mm_store_si128((__m128i *)(dst + 4 * line), _mm256_castsi256_si128(d0));
_mm_store_si128((__m128i *)(dst + 5 * line), _mm256_extracti128_si256(d0, 1));
_mm_store_si128((__m128i *)(dst + 6 * line), _mm256_castsi256_si128(d1));
_mm_store_si128((__m128i *)(dst + 7 * line), _mm256_extracti128_si256(d1, 1));
_mm_store_si128((__m128i *)(src + 4 * line), _mm256_castsi256_si128(d0));
_mm_store_si128((__m128i *)(src + 5 * line), _mm256_extracti128_si256(d0, 1));
_mm_store_si128((__m128i *)(src + 6 * line), _mm256_castsi256_si128(d1));
_mm_store_si128((__m128i *)(src + 7 * line), _mm256_extracti128_si256(d1, 1));
}

const oapv_fn_tx_t oapv_tbl_fn_txb_avx[2] =
{
oapv_tx_part_avx,
oapv_tx_avx,
NULL
};

Expand Down
9 changes: 8 additions & 1 deletion src/neon/oapv_tq_neon.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ const s32 oapv_coeff[8][4] =
high = vmulq_s32(part2, coeff); \
res = vcombine_s32(vpadd_s32(vget_low_s32(low), vget_high_s32(low)), vpadd_s32(vget_low_s32(high), vget_high_s32(high))); \

static void oapv_tx_pb8b_neon(s16 *src, s16 *dst, const int shift, int line)
static void oapv_tx_pb8b_part_neon(s16 *src, s16 *dst, const int shift, int line)
{
s16 i;
s16 *tempSrc = src;
Expand Down Expand Up @@ -186,6 +186,13 @@ static void oapv_tx_pb8b_neon(s16 *src, s16 *dst, const int shift, int line)
}
}

/*
 * Two-pass forward transform that works in place on `src`.
 *
 * The first call to oapv_tx_pb8b_part_neon() reads `src` and writes a local
 * scratch buffer using `shift1`; the second call reads that scratch buffer
 * and writes the result back into `src` using `shift2`.
 *
 * src    : input coefficients; overwritten with the output of the second pass
 * shift1 : shift handed to the first pass
 * shift2 : shift handed to the second pass
 * line   : forwarded unchanged to both passes
 *
 * NOTE(review): both passes receive the same `line` value, so this is
 * presumably intended for square blocks only - confirm against callers.
 */
static void oapv_tx_pb8b_neon(s16 *src, const int shift1, const int shift2, int line)
{
    /* Intermediate storage between the two passes (OAPV_BLK_D elements). */
    ALIGNED_16(s16 scratch[OAPV_BLK_D]);

    oapv_tx_pb8b_part_neon(src, scratch, shift1, line); /* src -> scratch */
    oapv_tx_pb8b_part_neon(scratch, src, shift2, line); /* scratch -> src */
}

const oapv_fn_tx_t oapv_tbl_fn_txb_neon[2] =
{
oapv_tx_pb8b_neon,
Expand Down
2 changes: 1 addition & 1 deletion src/oapv_def.h
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ typedef struct oapve_core oapve_core_t;
*****************************************************************************/
typedef void (*oapv_fn_itx_part_t)(s16 *coef, s16 *t, int shift, int line);
typedef void (*oapv_fn_itx_t)(s16 *coef, int shift1, int shift2, int line);
typedef void (*oapv_fn_tx_t)(s16 *coef, s16 *t, int shift, int line);
typedef void (*oapv_fn_tx_t)(s16 *coef, int shift1, int shift2, int line);
typedef void (*oapv_fn_itx_adj_t)(int *src, int *dst, int itrans_diff_idx, int diff_step, int shift);
typedef int (*oapv_fn_quant_t)(s16 *coef, u8 qp, int q_matrix[OAPV_BLK_D], int log2_w, int log2_h, int bit_depth, int deadzone_offset);
typedef void (*oapv_fn_dquant_t)(s16 *coef, s16 q_matrix[OAPV_BLK_D], int log2_w, int log2_h, s8 shift);
Expand Down
13 changes: 9 additions & 4 deletions src/oapv_tq.c
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,15 @@ static void oapv_tx_part(s16 *src, s16 *dst, int shift, int line)
}
}

/*
 * Two-pass forward transform that works in place on `src`.
 *
 * oapv_tx_part() is run twice: first from `src` into a local scratch buffer
 * with `shift1`, then from the scratch buffer back into `src` with `shift2`.
 *
 * src    : input coefficients; overwritten with the output of the second pass
 * shift1 : shift handed to the first pass
 * shift2 : shift handed to the second pass
 * line   : forwarded unchanged to both passes
 *
 * NOTE(review): both passes receive the same `line` value, so this is
 * presumably intended for square blocks only - confirm against callers.
 */
static void oapv_tx(s16 *src, int shift1, int shift2, int line)
{
    /* Intermediate storage between the two passes (OAPV_BLK_D elements). */
    ALIGNED_16(s16 scratch[OAPV_BLK_D]);

    oapv_tx_part(src, scratch, shift1, line); /* src -> scratch */
    oapv_tx_part(scratch, src, shift2, line); /* scratch -> src */
}

const oapv_fn_tx_t oapv_tbl_fn_tx[2] = {
oapv_tx_part,
oapv_tx,
NULL
};

Expand All @@ -90,9 +97,7 @@ void oapv_trans(oapve_ctx_t *ctx, s16 *coef, int log2_w, int log2_h, int bit_dep
int shift1 = get_transform_shift(log2_w, 0, bit_depth);
int shift2 = get_transform_shift(log2_h, 1, bit_depth);

ALIGNED_16(s16 tb[OAPV_BLK_D]);
(ctx->fn_txb)[0](coef, tb, shift1, 1 << log2_h);
(ctx->fn_txb)[0](tb, coef, shift2, 1 << log2_w);
(ctx->fn_txb)[0](coef, shift1, shift2, 1 << log2_h);
}

static int oapv_quant(s16 *coef, u8 qp, int q_matrix[OAPV_BLK_D], int log2_w, int log2_h, int bit_depth, int deadzone_offset)
Expand Down