Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 86 additions & 1 deletion src/avx/oapv_tq_avx.c
Original file line number Diff line number Diff line change
Expand Up @@ -95,10 +95,95 @@ static void oapv_tx_part_avx(s16 *src, s16 *dst, int shift, int line)
_mm_store_si128((__m128i *)(dst + 6 * line), _mm256_castsi256_si128(d1));
_mm_store_si128((__m128i *)(dst + 7 * line), _mm256_extracti128_si256(d1, 1));
}
/*
 * Table of AVX implementations with the oapv_fn_tx_part_t signature.
 * Slot 0 holds oapv_tx_part_avx; slot 1 is left NULL.
 */
const oapv_fn_tx_part_t oapv_tbl_fn_txb_part_avx[2] = {
    [0] = oapv_tx_part_avx,
    [1] = NULL,
};

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know whether this structure is actually used.
The fn_tx_part_t variable is not called anywhere...

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it is unused, so I am removing it.

/*
 * Two-pass transform of one block, performed in place with AVX2 intrinsics.
 *
 * src    : in/out buffer of s16 values. The first pass reads the 64
 *          contiguous entries src[0..63]; the second pass writes eight
 *          8-entry rows back at src + k * line (k = 0..7).
 * shift1 : shift amount handed to CALCU_2x8_ADD_SHIFT for the first pass.
 * shift2 : shift amount handed to CALCU_2x8_ADD_SHIFT for the second pass.
 * line   : distance, in s16 elements, between consecutive output rows.
 *
 * The output rows are written with _mm_store_si128, so every address
 * src + k * line must be 16-byte aligned. Both shifts must be at least 1
 * because the offsets are built as 1 << (shift - 1).
 *
 * NOTE(review): CALCU_2x8 and CALCU_2x8_ADD_SHIFT are defined outside this
 * function; they presumably read s0..s3 and use v0..v7 as scratch registers
 * - confirm against the macro definitions before removing any local.
 */
static void oapv_tx_avx(s16 *src, int shift1, int shift2, int line)
{
    __m256i v0, v1, v2, v3, v4, v5, v6, v7;
    __m256i d0, d1, d2, d3, d4, d5;
    __m256i coeff[8];

    /* Each coefficient register repeats the same 8 values in both 128-bit halves. */
    coeff[0] = _mm256_set1_epi16(64);
    coeff[1] = _mm256_set_epi16(64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64);
    coeff[2] = _mm256_set_epi16(84, 35, -35, -84, -84, -35, 35, 84, 84, 35, -35, -84, -84, -35, 35, 84);
    coeff[3] = _mm256_set_epi16(35, -84, 84, -35, -35, 84, -84, 35, 35, -84, 84, -35, -35, 84, -84, 35);
    coeff[4] = _mm256_set_epi16(-89, -75, -50, -18, 18, 50, 75, 89, -89, -75, -50, -18, 18, 50, 75, 89);
    coeff[5] = _mm256_set_epi16(-75, 18, 89, 50, -50, -89, -18, 75, -75, 18, 89, 50, -50, -89, -18, 75);
    coeff[6] = _mm256_set_epi16(-50, 89, -18, -75, 75, 18, -89, 50, -50, 89, -18, -75, 75, 18, -89, 50);
    coeff[7] = _mm256_set_epi16(-18, 50, -75, 89, -89, 75, -50, 18, -18, 50, -75, 89, -89, 75, -50, 18);

    /* Per-pass offsets passed to CALCU_2x8_ADD_SHIFT together with the shift. */
    __m256i add1 = _mm256_set1_epi32(1 << (shift1 - 1));
    __m256i add2 = _mm256_set1_epi32(1 << (shift2 - 1));

    __m256i s0, s1, s2, s3;

    /*
     * Unaligned loads: the low half of each register takes 8 entries from
     * src[0..31], the high half takes the 8 entries located 32 positions later.
     */
    s0 = _mm256_loadu2_m128i((const __m128i *)&src[32], (const __m128i *)&src[0]);
    s1 = _mm256_loadu2_m128i((const __m128i *)&src[40], (const __m128i *)&src[8]);
    s2 = _mm256_loadu2_m128i((const __m128i *)&src[48], (const __m128i *)&src[16]);
    s3 = _mm256_loadu2_m128i((const __m128i *)&src[56], (const __m128i *)&src[24]);

    /* ---- first pass (add1 / shift1), results kept in d0..d3 ---- */
    CALCU_2x8(coeff[0], coeff[4], d0, d1);
    CALCU_2x8(coeff[2], coeff[5], d2, d3);
    CALCU_2x8_ADD_SHIFT(d0, d1, d2, d3, add1, shift1);

    /* Saturating pack of the 32-bit results down to 16 bits. */
    d0 = _mm256_packs_epi32(d0, d1);
    d1 = _mm256_packs_epi32(d2, d3);

    /* 0xd8 selects 64-bit lanes 0,2,1,3: undoes the per-128-bit-lane interleave of the pack. */
    d0 = _mm256_permute4x64_epi64(d0, 0xd8);
    d1 = _mm256_permute4x64_epi64(d1, 0xd8);

    CALCU_2x8(coeff[1], coeff[6], d2, d3);
    CALCU_2x8(coeff[3], coeff[7], d4, d5);
    CALCU_2x8_ADD_SHIFT(d2, d3, d4, d5, add1, shift1);

    d2 = _mm256_packs_epi32(d2, d3);
    d3 = _mm256_packs_epi32(d4, d5);

    d2 = _mm256_permute4x64_epi64(d2, 0xd8);
    d3 = _mm256_permute4x64_epi64(d3, 0xd8);

    /* Rebuild s0..s3 from the first-pass results before the second round of CALCU_2x8. */
    s0 = _mm256_setr_m128i(_mm256_castsi256_si128(d0), _mm256_castsi256_si128(d2));
    s1 = _mm256_setr_m128i(_mm256_extracti128_si256(d0, 1), _mm256_extracti128_si256(d2, 1));
    s2 = _mm256_setr_m128i(_mm256_castsi256_si128(d1), _mm256_castsi256_si128(d3));
    s3 = _mm256_setr_m128i(_mm256_extracti128_si256(d1, 1), _mm256_extracti128_si256(d3, 1));

    /* ---- second pass (add2 / shift2), output rows 0..3 ---- */
    CALCU_2x8(coeff[0], coeff[4], d0, d1);
    CALCU_2x8(coeff[2], coeff[5], d2, d3);
    /* Terminated with ';' like the other three invocations of this macro. */
    CALCU_2x8_ADD_SHIFT(d0, d1, d2, d3, add2, shift2);

    d0 = _mm256_packs_epi32(d0, d1);
    d1 = _mm256_packs_epi32(d2, d3);

    d0 = _mm256_permute4x64_epi64(d0, 0xd8);
    d1 = _mm256_permute4x64_epi64(d1, 0xd8);

    /* Aligned 128-bit stores: one 8-entry row per store. */
    _mm_store_si128((__m128i *)src, _mm256_castsi256_si128(d0));
    _mm_store_si128((__m128i *)(src + 1 * line), _mm256_extracti128_si256(d0, 1));
    _mm_store_si128((__m128i *)(src + 2 * line), _mm256_castsi256_si128(d1));
    _mm_store_si128((__m128i *)(src + 3 * line), _mm256_extracti128_si256(d1, 1));

    /* ---- second pass, output rows 4..7 ---- */
    CALCU_2x8(coeff[1], coeff[6], d0, d1);
    CALCU_2x8(coeff[3], coeff[7], d2, d3);
    CALCU_2x8_ADD_SHIFT(d0, d1, d2, d3, add2, shift2);

    d0 = _mm256_packs_epi32(d0, d1);
    d1 = _mm256_packs_epi32(d2, d3);

    d0 = _mm256_permute4x64_epi64(d0, 0xd8);
    d1 = _mm256_permute4x64_epi64(d1, 0xd8);

    _mm_store_si128((__m128i *)(src + 4 * line), _mm256_castsi256_si128(d0));
    _mm_store_si128((__m128i *)(src + 5 * line), _mm256_extracti128_si256(d0, 1));
    _mm_store_si128((__m128i *)(src + 6 * line), _mm256_castsi256_si128(d1));
    _mm_store_si128((__m128i *)(src + 7 * line), _mm256_extracti128_si256(d1, 1));
}

/*
 * Table of AVX implementations with the oapv_fn_tx_t signature
 * (s16 *, int, int, int). Slot 0 holds oapv_tx_avx; slot 1 is left NULL.
 * The array has exactly two slots, so it takes exactly two initializers.
 */
const oapv_fn_tx_t oapv_tbl_fn_txb_avx[2] =
{
    oapv_tx_avx,
    NULL
};

Expand Down
1 change: 1 addition & 0 deletions src/avx/oapv_tq_avx.h
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@


#if X86_SSE
/* AVX function tables: each is a two-entry array of function pointers. */
extern const oapv_fn_tx_part_t oapv_tbl_fn_txb_part_avx[2];
extern const oapv_fn_tx_t oapv_tbl_fn_txb_avx[2];
extern const oapv_fn_quant_t oapv_tbl_fn_quant_avx[2];
extern const oapv_fn_itx_part_t oapv_tbl_fn_itx_part_avx[2];
Expand Down
9 changes: 8 additions & 1 deletion src/neon/oapv_tq_neon.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ const s32 oapv_coeff[8][4] =
high = vmulq_s32(part2, coeff); \
res = vcombine_s32(vpadd_s32(vget_low_s32(low), vget_high_s32(low)), vpadd_s32(vget_low_s32(high), vget_high_s32(high))); \

static void oapv_tx_pb8b_neon(s16 *src, s16 *dst, const int shift, int line)
static void oapv_tx_pb8b_part_neon(s16 *src, s16 *dst, const int shift, int line)
{
s16 i;
s16 *tempSrc = src;
Expand Down Expand Up @@ -186,6 +186,13 @@ static void oapv_tx_pb8b_neon(s16 *src, s16 *dst, const int shift, int line)
}
}

/*
 * Applies oapv_tx_pb8b_part_neon twice, leaving the final result in src.
 *
 * src    : in/out coefficient buffer.
 * shift1 : shift passed to the first call (src -> scratch).
 * shift2 : shift passed to the second call (scratch -> src).
 * line   : forwarded unchanged to both calls.
 */
static void oapv_tx_pb8b_neon(s16 *src, const int shift1, const int shift2, int line)
{
    /* 16-byte aligned scratch block holding the first call's output. */
    ALIGNED_16(s16 scratch[OAPV_BLK_D]);

    /* first call: src -> scratch */
    oapv_tx_pb8b_part_neon(src, scratch, shift1, line);

    /* second call: scratch -> src */
    oapv_tx_pb8b_part_neon(scratch, src, shift2, line);
}

const oapv_fn_tx_t oapv_tbl_fn_txb_neon[2] =
{
oapv_tx_pb8b_neon,
Expand Down
2 changes: 2 additions & 0 deletions src/oapv.c
Original file line number Diff line number Diff line change
Expand Up @@ -1209,6 +1209,7 @@ static int enc_platform_init(oapve_ctx_t *ctx)
ctx->fn_itx = oapv_tbl_fn_itx;
ctx->fn_itx_adj = oapv_tbl_fn_itx_adj;
ctx->fn_txb = oapv_tbl_fn_tx;
ctx->fn_txb_part = oapv_tbl_fn_tx_part;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ctx->fn_txb_part doesn't seem to be called anywhere.
Is this required?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed it in the latest commit.

ctx->fn_quant = oapv_tbl_fn_quant;
ctx->fn_dquant = oapv_tbl_fn_dquant;
ctx->fn_had8x8 = oapv_dc_removed_had8x8;
Expand All @@ -1227,6 +1228,7 @@ static int enc_platform_init(oapve_ctx_t *ctx)
ctx->fn_itx = oapv_tbl_fn_itx_avx;
ctx->fn_itx_adj = oapv_tbl_fn_itx_adj_avx;
ctx->fn_txb = oapv_tbl_fn_txb_avx;
ctx->fn_txb_part = oapv_tbl_fn_txb_part_avx;
ctx->fn_quant = oapv_tbl_fn_quant_avx;
ctx->fn_dquant = oapv_tbl_fn_dquant_avx;
ctx->fn_had8x8 = oapv_dc_removed_had8x8_sse;
Expand Down
4 changes: 3 additions & 1 deletion src/oapv_def.h
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,8 @@ typedef struct oapve_core oapve_core_t;
*****************************************************************************/
typedef void (*oapv_fn_itx_part_t)(s16 *coef, s16 *t, int shift, int line);
typedef void (*oapv_fn_itx_t)(s16 *coef, int shift1, int shift2, int line);
/* Single transform call with separate input (coef) and output (t) buffers and one shift. */
typedef void (*oapv_fn_tx_part_t)(s16 *coef, s16 *t, int shift, int line);
/* In-place transform on coef taking two shifts; this is the only definition of oapv_fn_tx_t. */
typedef void (*oapv_fn_tx_t)(s16 *coef, int shift1, int shift2, int line);
typedef void (*oapv_fn_itx_adj_t)(int *src, int *dst, int itrans_diff_idx, int diff_step, int shift);
typedef int (*oapv_fn_quant_t)(s16 *coef, u8 qp, int q_matrix[OAPV_BLK_D], int log2_w, int log2_h, int bit_depth, int deadzone_offset);
typedef void (*oapv_fn_dquant_t)(s16 *coef, s16 q_matrix[OAPV_BLK_D], int log2_w, int log2_h, s8 shift);
Expand Down Expand Up @@ -295,6 +296,7 @@ struct oapve_ctx {
const oapv_fn_itx_t *fn_itx;
const oapv_fn_itx_adj_t *fn_itx_adj;
const oapv_fn_tx_t *fn_txb;
const oapv_fn_tx_part_t *fn_txb_part;
const oapv_fn_quant_t *fn_quant;
const oapv_fn_dquant_t *fn_dquant;
const oapv_fn_sad_t *fn_sad;
Expand Down
19 changes: 15 additions & 4 deletions src/oapv_tq.c
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,21 @@ static void oapv_tx_part(s16 *src, s16 *dst, int shift, int line)
}
}

const oapv_fn_tx_t oapv_tbl_fn_tx[2] = {
const oapv_fn_tx_part_t oapv_tbl_fn_tx_part[2] =
{
oapv_tx_part,
NULL
};

/*
 * Applies oapv_tx_part twice, leaving the final result in src.
 *
 * src    : in/out coefficient buffer.
 * shift1 : shift passed to the first call (src -> scratch).
 * shift2 : shift passed to the second call (scratch -> src).
 * line   : forwarded unchanged to both calls.
 */
static void oapv_tx(s16 *src, int shift1, int shift2, int line)
{
    /* 16-byte aligned scratch block holding the first call's output. */
    ALIGNED_16(s16 scratch[OAPV_BLK_D]);

    /* first call: src -> scratch */
    oapv_tx_part(src, scratch, shift1, line);

    /* second call: scratch -> src */
    oapv_tx_part(scratch, src, shift2, line);
}

/*
 * Table of plain-C implementations with the oapv_fn_tx_t signature.
 * Slot 0 holds oapv_tx; slot 1 is left NULL.
 */
const oapv_fn_tx_t oapv_tbl_fn_tx[2] = {
    [0] = oapv_tx,
    [1] = NULL,
};

Expand All @@ -90,9 +103,7 @@ void oapv_trans(oapve_ctx_t *ctx, s16 *coef, int log2_w, int log2_h, int bit_dep
int shift1 = get_transform_shift(log2_w, 0, bit_depth);
int shift2 = get_transform_shift(log2_h, 1, bit_depth);

ALIGNED_16(s16 tb[OAPV_BLK_D]);
(ctx->fn_txb)[0](coef, tb, shift1, 1 << log2_h);
(ctx->fn_txb)[0](tb, coef, shift2, 1 << log2_w);
(ctx->fn_txb)[0](coef, shift1, shift2, 1 << log2_h);
}

static int oapv_quant(s16 *coef, u8 qp, int q_matrix[OAPV_BLK_D], int log2_w, int log2_h, int bit_depth, int deadzone_offset)
Expand Down
7 changes: 4 additions & 3 deletions src/oapv_tq.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,10 @@
#if ENABLE_ENCODER
///////////////////////////////////////////////////////////////////////////////

/* Function tables and the quantization scale table; each is declared once. */
extern const oapv_fn_tx_t      oapv_tbl_fn_tx[2];
extern const oapv_fn_tx_part_t oapv_tbl_fn_tx_part[2];
extern const oapv_fn_quant_t   oapv_tbl_fn_quant[2];
extern const int               oapv_quant_scale[6];

void oapv_trans(oapve_ctx_t *ctx, s16 *coef, int log2_w, int log2_h, int bit_depth);
void oapv_itx_get_wo_sft(s16 *src, s16 *dst, s32 *dst32, int shift, int line);
Expand Down