diff --git a/Source/Lib/Codec/EbEncDecProcess.c b/Source/Lib/Codec/EbEncDecProcess.c
index 1031a1d..e219ab9 100644
--- a/Source/Lib/Codec/EbEncDecProcess.c
+++ b/Source/Lib/Codec/EbEncDecProcess.c
@@ -463,7 +463,7 @@ void perform_coding_loop(
     switch (tx_size) {
     case TX_32X32:
         if (!(is_encode_pass && context_ptr->skip_eob_zero_mode_ep && *eob == 0)) {
-            residual_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][32 >> 3](
+            residual_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][32 >> 3](
                 input_buffer,
                 input_stride,
                 pred_buffer,
@@ -510,7 +510,7 @@ void perform_coding_loop(
         }
         if (context_ptr->spatial_sse_full_loop || (is_encode_pass && do_recon)) {
             // Hsan: both pred and rec samples are needed @ MD and EP to perform the eob zero mode decision
-            pic_copy_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][4](
+            pic_copy_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][4](
                 pred_buffer,
                 pred_stride,
                 recon_buffer,
@@ -530,7 +530,7 @@ void perform_coding_loop(
     case TX_16X16:
         if (!(is_encode_pass && context_ptr->skip_eob_zero_mode_ep && *eob == 0)) {
-            residual_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][16 >> 3](
+            residual_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][16 >> 3](
                 input_buffer,
                 input_stride,
                 pred_buffer,
@@ -570,7 +570,7 @@ void perform_coding_loop(
         if (context_ptr->spatial_sse_full_loop || (is_encode_pass && do_recon)) {
             // Hsan: both pred and rec samples are needed @ MD and EP to perform the eob zero mode decision
-            pic_copy_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][2](
+            pic_copy_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][2](
                 pred_buffer,
                 pred_stride,
                 recon_buffer,
@@ -592,7 +592,7 @@ void perform_coding_loop(
     case TX_8X8:
         if (!(is_encode_pass && context_ptr->skip_eob_zero_mode_ep && *eob == 0)) {
-            residual_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][8 >> 3](
+            residual_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][8 >> 3](
                 input_buffer,
                 input_stride,
                 pred_buffer,
@@ -632,7 +632,7 @@ void perform_coding_loop(
         if (context_ptr->spatial_sse_full_loop || (is_encode_pass && do_recon)) {
             // Hsan: both pred and rec samples are needed @ MD and EP to perform the eob zero mode decision
-            pic_copy_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][1](
+            pic_copy_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][1](
                 pred_buffer,
                 pred_stride,
                 recon_buffer,
@@ -655,7 +655,7 @@ void perform_coding_loop(
     default:
         assert(tx_size == TX_4X4);
         if (!(is_encode_pass && context_ptr->skip_eob_zero_mode_ep && *eob == 0)) {
-            residual_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][4 >> 3](
+            residual_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][4 >> 3](
                 input_buffer,
                 input_stride,
                 pred_buffer,
@@ -701,7 +701,7 @@ void perform_coding_loop(
         }
         if (context_ptr->spatial_sse_full_loop || (is_encode_pass && do_recon)) {
             // Hsan: both pred and rec samples are needed @ MD and EP to perform the eob zero mode decision
-            pic_copy_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][0](
+            pic_copy_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][0](
                 pred_buffer,
                 pred_stride,
                 recon_buffer,
@@ -761,7 +761,7 @@ void perform_inv_trans_add(
     switch (tx_size) {
     case TX_32X32:
-        pic_copy_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][4](
+        pic_copy_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][4](
             pred_buffer,
             pred_stride,
             recon_buffer,
@@ -781,7 +781,7 @@ void perform_inv_trans_add(
     case TX_16X16:
-        pic_copy_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][2](
+        pic_copy_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][2](
             pred_buffer,
             pred_stride,
             recon_buffer,
@@ -802,7 +802,7 @@ void perform_inv_trans_add(
         break;
     case TX_8X8:
-        pic_copy_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][1](
+        pic_copy_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][1](
             pred_buffer,
             pred_stride,
             recon_buffer,
@@ -824,7 +824,7 @@ void perform_inv_trans_add(
     default:
         assert(tx_size == TX_4X4);
-        pic_copy_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][0](
+        pic_copy_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][0](
             pred_buffer,
             pred_stride,
             recon_buffer,
@@ -926,7 +926,7 @@ static void perform_dist_rate_calc(
     int tu_size = 1 << (2 + tx_size);
     if (context_ptr->spatial_sse_full_loop) {
-        tufull_distortion[DIST_CALC_RESIDUAL] = (int)spatialfull_distortion_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][tx_size](
+        tufull_distortion[DIST_CALC_RESIDUAL] = (int)spatialfull_distortion_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][tx_size](
             input_buffer,
             input_stride,
             recon_buffer,
@@ -934,7 +934,7 @@ static void perform_dist_rate_calc(
             tu_size,
             tu_size);
-        tufull_distortion[DIST_CALC_PREDICTION] = (int)spatialfull_distortion_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][tx_size](
+        tufull_distortion[DIST_CALC_PREDICTION] = (int)spatialfull_distortion_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][tx_size](
             input_buffer,
             input_stride,
             pred_buffer,
@@ -948,7 +948,7 @@ static void perform_dist_rate_calc(
     else {
         const int shift = tx_size == TX_32X32 ? 0 : 2;
         uint64_t tufull_distortionTemp[DIST_CALC_TOTAL];
-        full_distortion_intrinsic_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][*eob != 0][0][tu_size >> 3](
+        full_distortion_intrinsic_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][*eob != 0][0][tu_size >> 3](
             trans_coeff_buffer,
             tu_size,
             recon_coeff_buffer,
@@ -987,7 +987,7 @@ static void perform_dist_rate_calc(
     tufull_distortion[DIST_CALC_RESIDUAL] = (tufull_distortion[DIST_CALC_PREDICTION]);
     *tu_coeff_bits = 0;
     if (*eob) {
-        pic_copy_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][tu_size == 32 ? 4 : tx_size](
+        pic_copy_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][tu_size == 32 ? 4 : tx_size](
             pred_buffer,
             pred_stride,
             recon_buffer,
@@ -1245,7 +1245,7 @@ static void perform_dist_rate_calc(
     candidate_buffer->candidate_ptr->eob[0][tu_index] = 0;
     int arr_index = tu_size[context_ptr->ep_block_stats_ptr->tx_size_uv] == 32 ? 4 : context_ptr->ep_block_stats_ptr->tx_size - 1;
     if (arr_index >= 0 && arr_index < 9)
-        pic_copy_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][arr_index](
+        pic_copy_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][arr_index](
             &(candidate_buffer->prediction_ptr->buffer_y[context_ptr->block_origin_index + pred_recon_tu_origin_index]),
             candidate_buffer->prediction_ptr->stride_y,
             &(candidate_buffer->recon_ptr->buffer_y[context_ptr->block_origin_index + pred_recon_tu_origin_index]),
@@ -1259,7 +1259,7 @@ static void perform_dist_rate_calc(
     candidate_buffer->candidate_ptr->eob[1][0] = 0;
     int arr_index = tu_size[context_ptr->ep_block_stats_ptr->tx_size_uv] == 32 ? 4 : context_ptr->ep_block_stats_ptr->tx_size - 1;
     if (arr_index >= 0 && arr_index < 9)
-        pic_copy_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][arr_index](
+        pic_copy_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][arr_index](
             &(candidate_buffer->prediction_ptr->buffer_cb[context_ptr->block_chroma_origin_index]),
             candidate_buffer->prediction_ptr->stride_cb,
             &(candidate_buffer->recon_ptr->buffer_cb[context_ptr->block_chroma_origin_index]),
@@ -1272,7 +1272,7 @@ static void perform_dist_rate_calc(
     candidate_buffer->candidate_ptr->eob[2][0] = 0;
     int arr_index = tu_size[context_ptr->ep_block_stats_ptr->tx_size_uv] == 32 ? 4 : context_ptr->ep_block_stats_ptr->tx_size - 1;
     if (arr_index >= 0 && arr_index < 9)
-        pic_copy_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][arr_index](
+        pic_copy_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][arr_index](
             &(candidate_buffer->prediction_ptr->buffer_cr[context_ptr->block_chroma_origin_index]),
             candidate_buffer->prediction_ptr->stride_cr,
             &(candidate_buffer->recon_ptr->buffer_cr[context_ptr->block_chroma_origin_index]),
@@ -1914,7 +1914,7 @@ static void perform_dist_rate_calc(
 {
     (void)sb_ptr;
     if ((context_ptr->ep_block_stats_ptr->sq_size >> 3) < 9) {
-        pic_copy_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][context_ptr->ep_block_stats_ptr->sq_size >> 3](
+        pic_copy_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][context_ptr->ep_block_stats_ptr->sq_size >> 3](
             &(recon_src_ptr->buffer_y[context_ptr->ep_block_stats_ptr->origin_x + context_ptr->ep_block_stats_ptr->origin_y * recon_src_ptr->stride_y]),
             recon_src_ptr->stride_y,
             &(recon_dst_ptr->buffer_y[context_ptr->ep_block_stats_ptr->origin_x + context_ptr->ep_block_stats_ptr->origin_y * recon_dst_ptr->stride_y]),
@@ -1926,7 +1926,7 @@ static void perform_dist_rate_calc(
     uint16_t chromaorigin_x = context_ptr->ep_block_stats_ptr->origin_x >> 1;
     uint16_t chromaorigin_y = context_ptr->ep_block_stats_ptr->origin_y >> 1;
-    pic_copy_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][context_ptr->ep_block_stats_ptr->sq_size_uv >> 3](
+    pic_copy_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][context_ptr->ep_block_stats_ptr->sq_size_uv >> 3](
         &(recon_src_ptr->buffer_cb[chromaorigin_x + chromaorigin_y * recon_src_ptr->stride_cb]),
         recon_src_ptr->stride_cb,
         &(recon_dst_ptr->buffer_cb[chromaorigin_x + chromaorigin_y * recon_dst_ptr->stride_cb]),
@@ -1934,7 +1934,7 @@ static void perform_dist_rate_calc(
         context_ptr->ep_block_stats_ptr->sq_size_uv,
         context_ptr->ep_block_stats_ptr->sq_size_uv);
-    pic_copy_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][context_ptr->ep_block_stats_ptr->sq_size_uv >> 3](
+    pic_copy_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][context_ptr->ep_block_stats_ptr->sq_size_uv >> 3](
         &(recon_src_ptr->buffer_cr[chromaorigin_x + chromaorigin_y * recon_src_ptr->stride_cr]),
         recon_src_ptr->stride_cr,
         &(recon_dst_ptr->buffer_cr[chromaorigin_x + chromaorigin_y * recon_dst_ptr->stride_cr]),
@@ -4215,7 +4215,7 @@ static void perform_dist_rate_calc(
     uint32_t pred_tu_origin_index = ((tu_index % 2) * tu_size[context_ptr->ep_block_stats_ptr->tx_size]) + ((tu_index > 1) * tu_size[context_ptr->ep_block_stats_ptr->tx_size] * context_ptr->prediction_buffer->stride_y);
     context_ptr->block_ptr->eob[0][tu_index] = 0;
     if (do_recon)
-        pic_copy_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][tu_size[context_ptr->ep_block_stats_ptr->tx_size] == 32 ? 4 : context_ptr->ep_block_stats_ptr->tx_size](
+        pic_copy_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][tu_size[context_ptr->ep_block_stats_ptr->tx_size] == 32 ? 4 : context_ptr->ep_block_stats_ptr->tx_size](
            &(context_ptr->prediction_buffer->buffer_y[context_ptr->block_origin_index + pred_tu_origin_index]),
            context_ptr->prediction_buffer->stride_y,
            &(context_ptr->recon_buffer->buffer_y[cuOriginReconIndex + pred_recon_tu_origin_index]),
@@ -4227,7 +4227,7 @@ static void perform_dist_rate_calc(
     if (context_ptr->block_ptr->eob[1][0]) {
         context_ptr->block_ptr->eob[1][0] = 0;
         if (do_recon)
-            pic_copy_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][tu_size[context_ptr->ep_block_stats_ptr->tx_size_uv] == 32 ? 4 : context_ptr->ep_block_stats_ptr->tx_size - 1](
+            pic_copy_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][tu_size[context_ptr->ep_block_stats_ptr->tx_size_uv] == 32 ? 4 : context_ptr->ep_block_stats_ptr->tx_size - 1](
                &((context_ptr->prediction_buffer->buffer_cb)[context_ptr->block_chroma_origin_index]),
                context_ptr->prediction_buffer->stride_cb,
                &(context_ptr->recon_buffer->buffer_cb[cuChromaOriginReconIndex]),
@@ -4238,7 +4238,7 @@ static void perform_dist_rate_calc(
     if (context_ptr->block_ptr->eob[2][0]) {
         context_ptr->block_ptr->eob[2][0] = 0;
         if (do_recon)
-            pic_copy_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][tu_size[context_ptr->ep_block_stats_ptr->tx_size_uv] == 32 ? 4 : context_ptr->ep_block_stats_ptr->tx_size - 1](
+            pic_copy_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][tu_size[context_ptr->ep_block_stats_ptr->tx_size_uv] == 32 ? 4 : context_ptr->ep_block_stats_ptr->tx_size - 1](
                &((context_ptr->prediction_buffer->buffer_cr)[context_ptr->block_chroma_origin_index]),
                context_ptr->prediction_buffer->stride_cr,
                &(context_ptr->recon_buffer->buffer_cr[cuChromaOriginReconIndex]),
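
Every call site touched in this file selects a kernel with the same idiom: ASM_TYPES is a runtime CPU-feature bitmask, and (ASM_TYPES & AVX2_MASK) && 1 collapses the masked bit to a 0/1 row index, so this patch only changes which feature bit gates the optimized row. A minimal sketch of the idiom follows; the table, mask value, and kernel here are illustrative stand-ins, since none of these definitions appear in the patch itself.

    #include <stdint.h>
    #include <string.h>

    #define AVX2_MASK (1 << 0) /* assumed: set in ASM_TYPES when AVX2 is usable */

    typedef void (*copy_kernel)(const uint8_t *src, uint32_t src_stride,
                                uint8_t *dst, uint32_t dst_stride,
                                uint32_t width, uint32_t height);

    static void copy_c(const uint8_t *src, uint32_t src_stride,
                       uint8_t *dst, uint32_t dst_stride,
                       uint32_t width, uint32_t height) {
        for (uint32_t y = 0; y < height; ++y)
            memcpy(dst + y * dst_stride, src + y * src_stride, width);
    }

    /* [0] = portable C fallback, [1] = SIMD build (aliased to C in this sketch). */
    static const copy_kernel copy_table[2] = { copy_c, copy_c };

    static uint32_t ASM_TYPES; /* stand-in for the runtime CPU-feature bitmask */

    void demo(const uint8_t *src, uint8_t *dst) {
        /* (mask & bit) is either 0 or the bit value; "&& 1" folds it to 0/1. */
        copy_table[(ASM_TYPES & AVX2_MASK) && 1](src, 64, dst, 64, 16, 16);
    }
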
diff --git a/Source/Lib/Codec/EbEncHandle.c b/Source/Lib/Codec/EbEncHandle.c
index 2669aae..8f346b1 100644
--- a/Source/Lib/Codec/EbEncHandle.c
+++ b/Source/Lib/Codec/EbEncHandle.c
@@ -239,12 +239,13 @@ static int32_t can_use_intel_avx512()
         | (1 << 30)  // AVX-512BW
         | (1 << 31); // AVX-512VL
 
+    if (!check4th_gen_intel_core_features())
+        return 0;
+
     // ensure OS supports ZMM registers (and YMM, and XMM)
     if (!check_xcr0_zmm())
         return 0;
 
-    if (!check4th_gen_intel_core_features())
-        return 0;
 
     run_cpuid(7, 0, abcd);
     if ((abcd[1] & avx512_ebx_mask) != avx512_ebx_mask)
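
The reorder above runs the CPUID-based generation probe before the XCR0 probe, so unsupported CPUs are rejected before the OS state-save check is consulted. A sketch of the patched control flow: the three helpers exist in EbEncHandle.c but their bodies are not part of this diff, so they are declared extern here, and the extra mask bits (AVX-512F/DQ/CD) are the standard CPUID leaf-7 EBX assignments rather than values shown in the patch.

    #include <stdint.h>

    extern int32_t check4th_gen_intel_core_features(void);
    extern int32_t check_xcr0_zmm(void);
    extern void    run_cpuid(uint32_t eax, uint32_t ecx, int32_t abcd[4]);

    static int32_t can_use_intel_avx512_sketch(void) {
        int32_t abcd[4];
        const uint32_t avx512_ebx_mask =
              (1u << 16)  /* AVX-512F  */
            | (1u << 17)  /* AVX-512DQ */
            | (1u << 28)  /* AVX-512CD */
            | (1u << 30)  /* AVX-512BW */
            | (1u << 31); /* AVX-512VL */

        /* Patched order: rule out pre-AVX2 parts first... */
        if (!check4th_gen_intel_core_features())
            return 0;

        /* ...then ask the OS (via XCR0) whether ZMM/YMM/XMM state is saved. */
        if (!check_xcr0_zmm())
            return 0;

        run_cpuid(7, 0, abcd);
        return ((uint32_t)abcd[1] & avx512_ebx_mask) == avx512_ebx_mask;
    }
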
diff --git a/Source/Lib/Codec/EbMotionEstimation.c b/Source/Lib/Codec/EbMotionEstimation.c
index 76122b1..7ed5ddb 100644
--- a/Source/Lib/Codec/EbMotionEstimation.c
+++ b/Source/Lib/Codec/EbMotionEstimation.c
@@ -438,87 +438,87 @@ static void get_search_point_results(
     //---- 16x16 : 0
     block_index = 0;
     search_position_index = search_position_tl_index;
-    sad_calculation_8x8_16x16_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1](src_ptr + block_index, src_stride, ref_ptr + search_position_index, refluma_stride, &p_best_sad8x8[0], &p_best_sad16x16[0], &p_best_mv8x8[0], &p_best_mv16x16[0], curr_mv, &p_sad16x16[0]);
+    sad_calculation_8x8_16x16_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1](src_ptr + block_index, src_stride, ref_ptr + search_position_index, refluma_stride, &p_best_sad8x8[0], &p_best_sad16x16[0], &p_best_mv8x8[0], &p_best_mv16x16[0], curr_mv, &p_sad16x16[0]);
 
     //---- 16x16 : 1
     block_index = block_index + 16;
     search_position_index = search_position_tl_index + 16;
-    sad_calculation_8x8_16x16_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1](src_ptr + block_index, src_stride, ref_ptr + search_position_index, refluma_stride, &p_best_sad8x8[4], &p_best_sad16x16[1], &p_best_mv8x8[4], &p_best_mv16x16[1], curr_mv, &p_sad16x16[1]);
+    sad_calculation_8x8_16x16_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1](src_ptr + block_index, src_stride, ref_ptr + search_position_index, refluma_stride, &p_best_sad8x8[4], &p_best_sad16x16[1], &p_best_mv8x8[4], &p_best_mv16x16[1], curr_mv, &p_sad16x16[1]);
 
     //---- 16x16 : 4
     block_index = block_index + 16;
     search_position_index = search_position_index + 16;
-    sad_calculation_8x8_16x16_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1](src_ptr + block_index, src_stride, ref_ptr + search_position_index, refluma_stride, &p_best_sad8x8[16], &p_best_sad16x16[4], &p_best_mv8x8[16], &p_best_mv16x16[4], curr_mv, &p_sad16x16[4]);
+    sad_calculation_8x8_16x16_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1](src_ptr + block_index, src_stride, ref_ptr + search_position_index, refluma_stride, &p_best_sad8x8[16], &p_best_sad16x16[4], &p_best_mv8x8[16], &p_best_mv16x16[4], curr_mv, &p_sad16x16[4]);
 
     //---- 16x16 : 5
     block_index = block_index + 16;
     search_position_index = search_position_index + 16;
-    sad_calculation_8x8_16x16_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1](src_ptr + block_index, src_stride, ref_ptr + search_position_index, refluma_stride, &p_best_sad8x8[20], &p_best_sad16x16[5], &p_best_mv8x8[20], &p_best_mv16x16[5], curr_mv, &p_sad16x16[5]);
+    sad_calculation_8x8_16x16_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1](src_ptr + block_index, src_stride, ref_ptr + search_position_index, refluma_stride, &p_best_sad8x8[20], &p_best_sad16x16[5], &p_best_mv8x8[20], &p_best_mv16x16[5], curr_mv, &p_sad16x16[5]);
 
     //---- 16x16 : 2
     block_index = src_next16x16_offset;
     search_position_index = search_position_tl_index + ref_next16x16_offset;
-    sad_calculation_8x8_16x16_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1](src_ptr + block_index, src_stride, ref_ptr + search_position_index, refluma_stride, &p_best_sad8x8[8], &p_best_sad16x16[2], &p_best_mv8x8[8], &p_best_mv16x16[2], curr_mv, &p_sad16x16[2]);
+    sad_calculation_8x8_16x16_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1](src_ptr + block_index, src_stride, ref_ptr + search_position_index, refluma_stride, &p_best_sad8x8[8], &p_best_sad16x16[2], &p_best_mv8x8[8], &p_best_mv16x16[2], curr_mv, &p_sad16x16[2]);
 
     //---- 16x16 : 3
     block_index = block_index + 16;
     search_position_index = search_position_index + 16;
-    sad_calculation_8x8_16x16_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1](src_ptr + block_index, src_stride, ref_ptr + search_position_index, refluma_stride, &p_best_sad8x8[12], &p_best_sad16x16[3], &p_best_mv8x8[12], &p_best_mv16x16[3], curr_mv, &p_sad16x16[3]);
+    sad_calculation_8x8_16x16_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1](src_ptr + block_index, src_stride, ref_ptr + search_position_index, refluma_stride, &p_best_sad8x8[12], &p_best_sad16x16[3], &p_best_mv8x8[12], &p_best_mv16x16[3], curr_mv, &p_sad16x16[3]);
 
     //---- 16x16 : 6
     block_index = block_index + 16;
     search_position_index = search_position_index + 16;
-    sad_calculation_8x8_16x16_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1](src_ptr + block_index, src_stride, ref_ptr + search_position_index, refluma_stride, &p_best_sad8x8[24], &p_best_sad16x16[6], &p_best_mv8x8[24], &p_best_mv16x16[6], curr_mv, &p_sad16x16[6]);
+    sad_calculation_8x8_16x16_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1](src_ptr + block_index, src_stride, ref_ptr + search_position_index, refluma_stride, &p_best_sad8x8[24], &p_best_sad16x16[6], &p_best_mv8x8[24], &p_best_mv16x16[6], curr_mv, &p_sad16x16[6]);
     //---- 16x16 : 7
     block_index = block_index + 16;
     search_position_index = search_position_index + 16;
-    sad_calculation_8x8_16x16_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1](src_ptr + block_index, src_stride, ref_ptr + search_position_index, refluma_stride, &p_best_sad8x8[28], &p_best_sad16x16[7], &p_best_mv8x8[28], &p_best_mv16x16[7], curr_mv, &p_sad16x16[7]);
+    sad_calculation_8x8_16x16_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1](src_ptr + block_index, src_stride, ref_ptr + search_position_index, refluma_stride, &p_best_sad8x8[28], &p_best_sad16x16[7], &p_best_mv8x8[28], &p_best_mv16x16[7], curr_mv, &p_sad16x16[7]);
 
     //---- 16x16 : 8
     block_index = (src_next16x16_offset << 1);
     search_position_index = search_position_tl_index + (ref_next16x16_offset << 1);
-    sad_calculation_8x8_16x16_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1](src_ptr + block_index, src_stride, ref_ptr + search_position_index, refluma_stride, &p_best_sad8x8[32], &p_best_sad16x16[8], &p_best_mv8x8[32], &p_best_mv16x16[8], curr_mv, &p_sad16x16[8]);
+    sad_calculation_8x8_16x16_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1](src_ptr + block_index, src_stride, ref_ptr + search_position_index, refluma_stride, &p_best_sad8x8[32], &p_best_sad16x16[8], &p_best_mv8x8[32], &p_best_mv16x16[8], curr_mv, &p_sad16x16[8]);
 
     //---- 16x16 : 9
     block_index = block_index + 16;
     search_position_index = search_position_index + 16;
-    sad_calculation_8x8_16x16_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1](src_ptr + block_index, src_stride, ref_ptr + search_position_index, refluma_stride, &p_best_sad8x8[36], &p_best_sad16x16[9], &p_best_mv8x8[36], &p_best_mv16x16[9], curr_mv, &p_sad16x16[9]);
+    sad_calculation_8x8_16x16_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1](src_ptr + block_index, src_stride, ref_ptr + search_position_index, refluma_stride, &p_best_sad8x8[36], &p_best_sad16x16[9], &p_best_mv8x8[36], &p_best_mv16x16[9], curr_mv, &p_sad16x16[9]);
 
     //---- 16x16 : 12
     block_index = block_index + 16;
     search_position_index = search_position_index + 16;
     search_position_index = search_position_index + 16;
-    sad_calculation_8x8_16x16_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1](src_ptr + block_index, src_stride, ref_ptr + search_position_index, refluma_stride, &p_best_sad8x8[48], &p_best_sad16x16[12], &p_best_mv8x8[48], &p_best_mv16x16[12], curr_mv, &p_sad16x16[12]);
+    sad_calculation_8x8_16x16_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1](src_ptr + block_index, src_stride, ref_ptr + search_position_index, refluma_stride, &p_best_sad8x8[48], &p_best_sad16x16[12], &p_best_mv8x8[48], &p_best_mv16x16[12], curr_mv, &p_sad16x16[12]);
 
     //---- 16x16 : 13
     block_index = block_index + 16;
     search_position_index = search_position_index + 16;
-    sad_calculation_8x8_16x16_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1](src_ptr + block_index, src_stride, ref_ptr + search_position_index, refluma_stride, &p_best_sad8x8[52], &p_best_sad16x16[13], &p_best_mv8x8[52], &p_best_mv16x16[13], curr_mv, &p_sad16x16[13]);
+    sad_calculation_8x8_16x16_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1](src_ptr + block_index, src_stride, ref_ptr + search_position_index, refluma_stride, &p_best_sad8x8[52], &p_best_sad16x16[13], &p_best_mv8x8[52], &p_best_mv16x16[13], curr_mv, &p_sad16x16[13]);
 
     //---- 16x16 : 10
     block_index = (src_next16x16_offset * 3);
     search_position_index = search_position_tl_index + (ref_next16x16_offset * 3);
-    sad_calculation_8x8_16x16_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1](src_ptr + block_index, src_stride, ref_ptr + search_position_index, refluma_stride, &p_best_sad8x8[40], &p_best_sad16x16[10], &p_best_mv8x8[40], &p_best_mv16x16[10], curr_mv, &p_sad16x16[10]);
+    sad_calculation_8x8_16x16_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1](src_ptr + block_index, src_stride, ref_ptr + search_position_index, refluma_stride, &p_best_sad8x8[40], &p_best_sad16x16[10], &p_best_mv8x8[40], &p_best_mv16x16[10], curr_mv, &p_sad16x16[10]);
 
     //---- 16x16 : 11
     block_index = block_index + 16;
     search_position_index = search_position_index + 16;
-    sad_calculation_8x8_16x16_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1](src_ptr + block_index, src_stride, ref_ptr + search_position_index, refluma_stride, &p_best_sad8x8[44], &p_best_sad16x16[11], &p_best_mv8x8[44], &p_best_mv16x16[11], curr_mv, &p_sad16x16[11]);
+    sad_calculation_8x8_16x16_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1](src_ptr + block_index, src_stride, ref_ptr + search_position_index, refluma_stride, &p_best_sad8x8[44], &p_best_sad16x16[11], &p_best_mv8x8[44], &p_best_mv16x16[11], curr_mv, &p_sad16x16[11]);
 
     //---- 16x16 : 14
     block_index = block_index + 16;
     search_position_index = search_position_index + 16;
-    sad_calculation_8x8_16x16_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1](src_ptr + block_index, src_stride, ref_ptr + search_position_index, refluma_stride, &p_best_sad8x8[56], &p_best_sad16x16[14], &p_best_mv8x8[56], &p_best_mv16x16[14], curr_mv, &p_sad16x16[14]);
+    sad_calculation_8x8_16x16_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1](src_ptr + block_index, src_stride, ref_ptr + search_position_index, refluma_stride, &p_best_sad8x8[56], &p_best_sad16x16[14], &p_best_mv8x8[56], &p_best_mv16x16[14], curr_mv, &p_sad16x16[14]);
 
     //---- 16x16 : 15
     block_index = block_index + 16;
     search_position_index = search_position_index + 16;
-    sad_calculation_8x8_16x16_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1](src_ptr + block_index, src_stride, ref_ptr + search_position_index, refluma_stride, &p_best_sad8x8[60], &p_best_sad16x16[15], &p_best_mv8x8[60], &p_best_mv16x16[15], curr_mv, &p_sad16x16[15]);
+    sad_calculation_8x8_16x16_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1](src_ptr + block_index, src_stride, ref_ptr + search_position_index, refluma_stride, &p_best_sad8x8[60], &p_best_sad16x16[15], &p_best_mv8x8[60], &p_best_mv16x16[15], curr_mv, &p_sad16x16[15]);
 
-    sad_calculation_32x32_64x64_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1](p_sad16x16, p_best_sad32x32, p_best_sad64x64, p_best_mv32x32, p_best_mv64x64, curr_mv);
+    sad_calculation_32x32_64x64_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1](p_sad16x16, p_best_sad32x32, p_best_sad64x64, p_best_mv32x32, p_best_mv64x64, curr_mv);
 }
@@ -623,7 +623,7 @@ void interpolate_search_region_avc(
     // Half pel interpolation of the search region using f1 -> posb_buffer
     if (search_area_width_for_asm){
-        avc_style_uni_pred_luma_if_function_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][1](
+        avc_style_uni_pred_luma_if_function_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][1](
             search_region_buffer - (ME_FILTER_TAP >> 1) * luma_stride - (ME_FILTER_TAP >> 1) + 1,
             luma_stride,
             context_ptr->posb_buffer[list_index][0],
@@ -636,7 +636,7 @@ void interpolate_search_region_avc(
     // Half pel interpolation of the search region using f1 -> posh_buffer
     if (search_area_width_for_asm){
-        avc_style_uni_pred_luma_if_function_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][2](
+        avc_style_uni_pred_luma_if_function_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][2](
            search_region_buffer - (ME_FILTER_TAP >> 1) * luma_stride - 1 + luma_stride,
            luma_stride,
            context_ptr->posh_buffer[list_index][0],
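
For orientation: index [1] of the interpolation table writes the horizontal half-pel plane (posb), index [2] the vertical one (posh), and the next hunk reuses the vertical filter on posb to produce the diagonal plane (posj). A toy 2-tap model of that three-plane layout follows; the encoder's real kernels apply AVC-style multi-tap filters, so this is illustrative only, and it assumes the source block is padded by at least one pixel to the right and below.

    #include <stdint.h>

    static uint8_t avg2(uint8_t a, uint8_t b) { return (uint8_t)((a + b + 1) >> 1); }

    void half_pel_grid(const uint8_t *src, int src_stride,
                       uint8_t *posb, uint8_t *posh, uint8_t *posj,
                       int width, int height) {
        for (int y = 0; y < height; ++y) {
            for (int x = 0; x < width; ++x) {
                const uint8_t *p = src + y * src_stride + x;
                posb[y * width + x] = avg2(p[0], p[1]);          /* half-pel right */
                posh[y * width + x] = avg2(p[0], p[src_stride]); /* half-pel down  */
            }
        }
        /* Diagonal plane: filter the horizontal result vertically, mirroring how
         * the posj pass below reads posb_buffer instead of the integer-pel source. */
        for (int y = 0; y < height; ++y) {
            int y1 = (y + 1 < height) ? y + 1 : y; /* clamp at the bottom edge */
            for (int x = 0; x < width; ++x)
                posj[y * width + x] = avg2(posb[y * width + x], posb[y1 * width + x]);
        }
    }
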
@@ -649,7 +649,7 @@ void interpolate_search_region_avc(
     if (search_area_width_for_asm){
         // Half pel interpolation of the search region using f1 -> posj_buffer
-        avc_style_uni_pred_luma_if_function_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][2](
+        avc_style_uni_pred_luma_if_function_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][2](
            context_ptr->posb_buffer[list_index][0] + context_ptr->interpolated_stride,
            context_ptr->interpolated_stride,
            context_ptr->posj_buffer[list_index][0],
@@ -733,7 +733,7 @@ static void pu_half_pel_refinement(
     // Compute SSD for the best full search candidate
     if (context_ptr->fractional_search_method == SSD_SEARCH) {
-        *p_best_ssd = (uint32_t) spatialfull_distortion_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][Log2f(pu_width) - 2](
+        *p_best_ssd = (uint32_t) spatialfull_distortion_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][Log2f(pu_width) - 2](
             &(context_ptr->sb_src_ptr[pu_sb_buffer_index]),
             context_ptr->sb_src_stride,
             &(ref_buffer[y_search_index * ref_stride + x_search_index]),
@@ -747,7 +747,7 @@ static void pu_half_pel_refinement(
     search_region_index = x_search_index + (int16_t)context_ptr->interpolated_stride * y_search_index;
     distortion_left_position = (context_ptr->fractional_search_method == SSD_SEARCH) ?
-        spatialfull_distortion_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][Log2f(pu_width) - 2](&(context_ptr->sb_src_ptr[pu_sb_buffer_index]), context_ptr->sb_src_stride, &(posb_buffer[search_region_index]), context_ptr->interpolated_stride, pu_height, pu_width) :
+        spatialfull_distortion_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][Log2f(pu_width) - 2](&(context_ptr->sb_src_ptr[pu_sb_buffer_index]), context_ptr->sb_src_stride, &(posb_buffer[search_region_index]), context_ptr->interpolated_stride, pu_height, pu_width) :
         (context_ptr->fractional_search_method == SUB_SAD_SEARCH) ?
         (n_x_m_sad_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_src_ptr[pu_sb_buffer_index]), context_ptr->sb_src_stride << 1, &(posb_buffer[search_region_index]), context_ptr->interpolated_stride << 1, pu_height >> 1, pu_width)) << 1 :
         n_x_m_sad_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_src_ptr[pu_sb_buffer_index]), context_ptr->sb_src_stride, &(posb_buffer[search_region_index]), context_ptr->interpolated_stride, pu_height, pu_width);
@@ -770,7 +770,7 @@ static void pu_half_pel_refinement(
     search_region_index++;
     distortion_right_position = (context_ptr->fractional_search_method == SSD_SEARCH) ?
-        spatialfull_distortion_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][Log2f(pu_width) - 2](&(context_ptr->sb_src_ptr[pu_sb_buffer_index]), context_ptr->sb_src_stride, &(posb_buffer[search_region_index]), context_ptr->interpolated_stride, pu_height, pu_width) :
+        spatialfull_distortion_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][Log2f(pu_width) - 2](&(context_ptr->sb_src_ptr[pu_sb_buffer_index]), context_ptr->sb_src_stride, &(posb_buffer[search_region_index]), context_ptr->interpolated_stride, pu_height, pu_width) :
         (context_ptr->fractional_search_method == SUB_SAD_SEARCH) ?
         (n_x_m_sad_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_src_ptr[pu_sb_buffer_index]), context_ptr->sb_src_stride << 1, &(posb_buffer[search_region_index]), context_ptr->interpolated_stride << 1, pu_height >> 1, pu_width)) << 1 :
         n_x_m_sad_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_src_ptr[pu_sb_buffer_index]), context_ptr->sb_src_stride, &(posb_buffer[search_region_index]), context_ptr->interpolated_stride, pu_height, pu_width);
@@ -794,7 +794,7 @@ static void pu_half_pel_refinement(
     search_region_index = x_search_index + (int16_t)context_ptr->interpolated_stride * y_search_index;
     distortion_top_position = (context_ptr->fractional_search_method == SSD_SEARCH) ?
-        spatialfull_distortion_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][Log2f(pu_width) - 2](&(context_ptr->sb_src_ptr[pu_sb_buffer_index]), context_ptr->sb_src_stride, &(posh_buffer[search_region_index]), context_ptr->interpolated_stride, pu_height, pu_width) :
+        spatialfull_distortion_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][Log2f(pu_width) - 2](&(context_ptr->sb_src_ptr[pu_sb_buffer_index]), context_ptr->sb_src_stride, &(posh_buffer[search_region_index]), context_ptr->interpolated_stride, pu_height, pu_width) :
         (context_ptr->fractional_search_method == SUB_SAD_SEARCH) ?
         (n_x_m_sad_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_src_ptr[pu_sb_buffer_index]), context_ptr->sb_src_stride << 1, &(posh_buffer[search_region_index]), context_ptr->interpolated_stride << 1, pu_height >> 1, pu_width)) << 1 :
         n_x_m_sad_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_src_ptr[pu_sb_buffer_index]), context_ptr->sb_src_stride, &(posh_buffer[search_region_index]), context_ptr->interpolated_stride, pu_height, pu_width);
@@ -817,7 +817,7 @@ static void pu_half_pel_refinement(
     search_region_index += (int16_t)context_ptr->interpolated_stride;
     distortion_bottom_position = (context_ptr->fractional_search_method == SSD_SEARCH) ?
-        spatialfull_distortion_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][Log2f(pu_width) - 2](&(context_ptr->sb_src_ptr[pu_sb_buffer_index]), context_ptr->sb_src_stride, &(posh_buffer[search_region_index]), context_ptr->interpolated_stride, pu_height, pu_width) :
+        spatialfull_distortion_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][Log2f(pu_width) - 2](&(context_ptr->sb_src_ptr[pu_sb_buffer_index]), context_ptr->sb_src_stride, &(posh_buffer[search_region_index]), context_ptr->interpolated_stride, pu_height, pu_width) :
         (context_ptr->fractional_search_method == SUB_SAD_SEARCH) ?
         (n_x_m_sad_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_src_ptr[pu_sb_buffer_index]), context_ptr->sb_src_stride << 1, &(posh_buffer[search_region_index]), context_ptr->interpolated_stride << 1, pu_height >> 1, pu_width)) << 1 :
         n_x_m_sad_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_src_ptr[pu_sb_buffer_index]), context_ptr->sb_src_stride, &(posh_buffer[search_region_index]), context_ptr->interpolated_stride, pu_height, pu_width);
@@ -840,7 +840,7 @@ static void pu_half_pel_refinement(
     search_region_index = x_search_index + (int16_t)context_ptr->interpolated_stride * y_search_index;
     distortion_top_left_position = (context_ptr->fractional_search_method == SSD_SEARCH) ?
-        spatialfull_distortion_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][Log2f(pu_width) - 2](&(context_ptr->sb_src_ptr[pu_sb_buffer_index]), context_ptr->sb_src_stride, &(posj_buffer[search_region_index]), context_ptr->interpolated_stride, pu_height, pu_width) :
+        spatialfull_distortion_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][Log2f(pu_width) - 2](&(context_ptr->sb_src_ptr[pu_sb_buffer_index]), context_ptr->sb_src_stride, &(posj_buffer[search_region_index]), context_ptr->interpolated_stride, pu_height, pu_width) :
         (context_ptr->fractional_search_method == SUB_SAD_SEARCH) ?
         (n_x_m_sad_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_src_ptr[pu_sb_buffer_index]), context_ptr->sb_src_stride << 1, &(posj_buffer[search_region_index]), context_ptr->interpolated_stride << 1, pu_height >> 1, pu_width)) << 1 :
         n_x_m_sad_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_src_ptr[pu_sb_buffer_index]), context_ptr->sb_src_stride, &(posj_buffer[search_region_index]), context_ptr->interpolated_stride, pu_height, pu_width);
@@ -864,7 +864,7 @@ static void pu_half_pel_refinement(
     search_region_index++;
     distortion_top_right_position = (context_ptr->fractional_search_method == SSD_SEARCH) ?
-        spatialfull_distortion_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][Log2f(pu_width) - 2](&(context_ptr->sb_src_ptr[pu_sb_buffer_index]), context_ptr->sb_src_stride, &(posj_buffer[search_region_index]), context_ptr->interpolated_stride, pu_height, pu_width) :
+        spatialfull_distortion_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][Log2f(pu_width) - 2](&(context_ptr->sb_src_ptr[pu_sb_buffer_index]), context_ptr->sb_src_stride, &(posj_buffer[search_region_index]), context_ptr->interpolated_stride, pu_height, pu_width) :
         (context_ptr->fractional_search_method == SUB_SAD_SEARCH) ?
         (n_x_m_sad_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_src_ptr[pu_sb_buffer_index]), context_ptr->sb_src_stride << 1, &(posj_buffer[search_region_index]), context_ptr->interpolated_stride << 1, pu_height >> 1, pu_width)) << 1 :
         n_x_m_sad_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_src_ptr[pu_sb_buffer_index]), context_ptr->sb_src_stride, &(posj_buffer[search_region_index]), context_ptr->interpolated_stride, pu_height, pu_width);
@@ -887,7 +887,7 @@ static void pu_half_pel_refinement(
     search_region_index += (int16_t)context_ptr->interpolated_stride;
     distortion_bottom_right_position = (context_ptr->fractional_search_method == SSD_SEARCH) ?
-        spatialfull_distortion_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][Log2f(pu_width) - 2](&(context_ptr->sb_src_ptr[pu_sb_buffer_index]), context_ptr->sb_src_stride, &(posj_buffer[search_region_index]), context_ptr->interpolated_stride, pu_height, pu_width) :
+        spatialfull_distortion_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][Log2f(pu_width) - 2](&(context_ptr->sb_src_ptr[pu_sb_buffer_index]), context_ptr->sb_src_stride, &(posj_buffer[search_region_index]), context_ptr->interpolated_stride, pu_height, pu_width) :
         (context_ptr->fractional_search_method == SUB_SAD_SEARCH) ?
         (n_x_m_sad_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_src_ptr[pu_sb_buffer_index]), context_ptr->sb_src_stride << 1, &(posj_buffer[search_region_index]), context_ptr->interpolated_stride << 1, pu_height >> 1, pu_width)) << 1 :
         n_x_m_sad_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_src_ptr[pu_sb_buffer_index]), context_ptr->sb_src_stride, &(posj_buffer[search_region_index]), context_ptr->interpolated_stride, pu_height, pu_width);
@@ -910,7 +910,7 @@ static void pu_half_pel_refinement(
     search_region_index--;
     distortion_bottom_left_position = (context_ptr->fractional_search_method == SSD_SEARCH) ?
-        spatialfull_distortion_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][Log2f(pu_width) - 2](&(context_ptr->sb_src_ptr[pu_sb_buffer_index]), context_ptr->sb_src_stride, &(posj_buffer[search_region_index]), context_ptr->interpolated_stride, pu_height, pu_width) :
+        spatialfull_distortion_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][Log2f(pu_width) - 2](&(context_ptr->sb_src_ptr[pu_sb_buffer_index]), context_ptr->sb_src_stride, &(posj_buffer[search_region_index]), context_ptr->interpolated_stride, pu_height, pu_width) :
         (context_ptr->fractional_search_method == SUB_SAD_SEARCH) ?
         (n_x_m_sad_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_src_ptr[pu_sb_buffer_index]), context_ptr->sb_src_stride << 1, &(posj_buffer[search_region_index]), context_ptr->interpolated_stride << 1, pu_height >> 1, pu_width)) << 1 :
         (n_x_m_sad_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_src_ptr[pu_sb_buffer_index]), context_ptr->sb_src_stride, &(posj_buffer[search_region_index]), context_ptr->interpolated_stride, pu_height, pu_width));
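
All the half-pel hunks above dispatch one of three distortion modes: exact SSD, full SAD, or SUB_SAD_SEARCH, which doubles both strides and halves the height to measure every other row, then shifts the result left by one to approximate the full-height SAD. A plain-C model of that choice follows; the signatures are simplified stand-ins, not the encoder's kernels.

    #include <stdint.h>
    #include <stdlib.h>

    static uint32_t sad(const uint8_t *a, int a_stride,
                        const uint8_t *b, int b_stride, int h, int w) {
        uint32_t acc = 0;
        for (int y = 0; y < h; ++y)
            for (int x = 0; x < w; ++x)
                acc += (uint32_t)abs(a[y * a_stride + x] - b[y * b_stride + x]);
        return acc;
    }

    uint32_t distortion(const uint8_t *src, int src_stride,
                        const uint8_t *ref, int ref_stride,
                        int h, int w, int sub_sad) {
        if (sub_sad)
            /* Every other row (doubled strides, halved height), scaled back up:
             * the same operand shape as SUB_SAD_SEARCH in the hunks above. */
            return sad(src, src_stride << 1, ref, ref_stride << 1, h >> 1, w) << 1;
        return sad(src, src_stride, ref, ref_stride, h, w);
    }
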
@@ -1242,12 +1242,12 @@ static void pu_quarter_pel_refinement_on_the_fly(
     dist = (context_ptr->fractional_search_method == SSD_SEARCH) ?
         combined_averaging_ssd(&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[0] + search_region_index1, buf1_stride[0], buf2[0] + search_region_index2, buf2_stride[0], pu_height, pu_width) :
         (context_ptr->fractional_search_method == SUB_SAD_SEARCH) ?
-        (nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE << 1, buf1[0] + search_region_index1, buf1_stride[0] << 1, buf2[0] + search_region_index2, buf2_stride[0] << 1, pu_height >> 1, pu_width)) << 1 :
-        nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[0] + search_region_index1, buf1_stride[0], buf2[0] + search_region_index2, buf2_stride[0], pu_height, pu_width);
+        (nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE << 1, buf1[0] + search_region_index1, buf1_stride[0] << 1, buf2[0] + search_region_index2, buf2_stride[0] << 1, pu_height >> 1, pu_width)) << 1 :
+        nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[0] + search_region_index1, buf1_stride[0], buf2[0] + search_region_index2, buf2_stride[0], pu_height, pu_width);
     if (context_ptr->fractional_search_method == SSD_SEARCH) {
         if (dist < *p_best_ssd) {
-            *p_best_sad = (uint32_t)nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[0] + search_region_index1, buf1_stride[0], buf2[0] + search_region_index2, buf2_stride[0], pu_height, pu_width);
+            *p_best_sad = (uint32_t)nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[0] + search_region_index1, buf1_stride[0], buf2[0] + search_region_index2, buf2_stride[0], pu_height, pu_width);
             *p_best_mv = ((uint16_t)y_mv_quarter[0] << 16) | ((uint16_t)x_mv_quarter[0]);
             *p_best_ssd = (uint32_t)dist;
         }
@@ -1269,12 +1269,12 @@ static void pu_quarter_pel_refinement_on_the_fly(
     dist = (context_ptr->fractional_search_method == SSD_SEARCH) ?
         combined_averaging_ssd(&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[1] + search_region_index1, buf1_stride[1], buf2[1] + search_region_index2, buf2_stride[1], pu_height, pu_width) :
         (context_ptr->fractional_search_method == SUB_SAD_SEARCH) ?
-        (nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE << 1, buf1[1] + search_region_index1, buf1_stride[1] << 1, buf2[1] + search_region_index2, buf2_stride[1] << 1, pu_height >> 1, pu_width)) << 1 :
-        nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[1] + search_region_index1, buf1_stride[1], buf2[1] + search_region_index2, buf2_stride[1], pu_height, pu_width);
+        (nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE << 1, buf1[1] + search_region_index1, buf1_stride[1] << 1, buf2[1] + search_region_index2, buf2_stride[1] << 1, pu_height >> 1, pu_width)) << 1 :
+        nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[1] + search_region_index1, buf1_stride[1], buf2[1] + search_region_index2, buf2_stride[1], pu_height, pu_width);
     if (context_ptr->fractional_search_method == SSD_SEARCH) {
         if (dist < *p_best_ssd) {
-            *p_best_sad = (uint32_t)nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[1] + search_region_index1, buf1_stride[1], buf2[1] + search_region_index2, buf2_stride[1], pu_height, pu_width);
+            *p_best_sad = (uint32_t)nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[1] + search_region_index1, buf1_stride[1], buf2[1] + search_region_index2, buf2_stride[1], pu_height, pu_width);
             *p_best_mv = ((uint16_t)y_mv_quarter[1] << 16) | ((uint16_t)x_mv_quarter[1]);
             *p_best_ssd = (uint32_t)dist;
         }
@@ -1296,12 +1296,12 @@ static void pu_quarter_pel_refinement_on_the_fly(
     dist = (context_ptr->fractional_search_method == SSD_SEARCH) ?
         combined_averaging_ssd(&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[2] + search_region_index1, buf1_stride[2], buf2[2] + search_region_index2, buf2_stride[2], pu_height, pu_width) :
         (context_ptr->fractional_search_method == SUB_SAD_SEARCH) ?
-        (nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE << 1, buf1[2] + search_region_index1, buf1_stride[2] << 1, buf2[2] + search_region_index2, buf2_stride[2] << 1, pu_height >> 1, pu_width)) << 1 :
-        nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[2] + search_region_index1, buf1_stride[2], buf2[2] + search_region_index2, buf2_stride[2], pu_height, pu_width);
+        (nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE << 1, buf1[2] + search_region_index1, buf1_stride[2] << 1, buf2[2] + search_region_index2, buf2_stride[2] << 1, pu_height >> 1, pu_width)) << 1 :
+        nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[2] + search_region_index1, buf1_stride[2], buf2[2] + search_region_index2, buf2_stride[2], pu_height, pu_width);
     if (context_ptr->fractional_search_method == SSD_SEARCH) {
         if (dist < *p_best_ssd) {
-            *p_best_sad = (uint32_t)nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[2] + search_region_index1, buf1_stride[2], buf2[2] + search_region_index2, buf2_stride[2], pu_height, pu_width);
+            *p_best_sad = (uint32_t)nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[2] + search_region_index1, buf1_stride[2], buf2[2] + search_region_index2, buf2_stride[2], pu_height, pu_width);
             *p_best_mv = ((uint16_t)y_mv_quarter[2] << 16) | ((uint16_t)x_mv_quarter[2]);
             *p_best_ssd = (uint32_t)dist;
         }
@@ -1323,12 +1323,12 @@ static void pu_quarter_pel_refinement_on_the_fly(
     dist = (context_ptr->fractional_search_method == SSD_SEARCH) ?
         combined_averaging_ssd(&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[3] + search_region_index1, buf1_stride[3], buf2[3] + search_region_index2, buf2_stride[3], pu_height, pu_width) :
         (context_ptr->fractional_search_method == SUB_SAD_SEARCH) ?
-        (nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE << 1, buf1[3] + search_region_index1, buf1_stride[3] << 1, buf2[3] + search_region_index2, buf2_stride[3] << 1, pu_height >> 1, pu_width)) << 1 :
-        nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[3] + search_region_index1, buf1_stride[3], buf2[3] + search_region_index2, buf2_stride[3], pu_height, pu_width);
+        (nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE << 1, buf1[3] + search_region_index1, buf1_stride[3] << 1, buf2[3] + search_region_index2, buf2_stride[3] << 1, pu_height >> 1, pu_width)) << 1 :
+        nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[3] + search_region_index1, buf1_stride[3], buf2[3] + search_region_index2, buf2_stride[3], pu_height, pu_width);
     if (context_ptr->fractional_search_method == SSD_SEARCH) {
         if (dist < *p_best_ssd) {
-            *p_best_sad = (uint32_t)nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[3] + search_region_index1, buf1_stride[3], buf2[3] + search_region_index2, buf2_stride[3], pu_height, pu_width);
+            *p_best_sad = (uint32_t)nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[3] + search_region_index1, buf1_stride[3], buf2[3] + search_region_index2, buf2_stride[3], pu_height, pu_width);
             *p_best_mv = ((uint16_t)y_mv_quarter[3] << 16) | ((uint16_t)x_mv_quarter[3]);
             *p_best_ssd = (uint32_t)dist;
         }
@@ -1350,12 +1350,12 @@ static void pu_quarter_pel_refinement_on_the_fly(
     dist = (context_ptr->fractional_search_method == SSD_SEARCH) ?
         combined_averaging_ssd(&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[4] + search_region_index1, buf1_stride[4], buf2[4] + search_region_index2, buf2_stride[4], pu_height, pu_width) :
         (context_ptr->fractional_search_method == SUB_SAD_SEARCH) ?
-        (nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE << 1, buf1[4] + search_region_index1, buf1_stride[4] << 1, buf2[4] + search_region_index2, buf2_stride[4] << 1, pu_height >> 1, pu_width)) << 1 :
-        nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[4] + search_region_index1, buf1_stride[4], buf2[4] + search_region_index2, buf2_stride[4], pu_height, pu_width);
+        (nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE << 1, buf1[4] + search_region_index1, buf1_stride[4] << 1, buf2[4] + search_region_index2, buf2_stride[4] << 1, pu_height >> 1, pu_width)) << 1 :
+        nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[4] + search_region_index1, buf1_stride[4], buf2[4] + search_region_index2, buf2_stride[4], pu_height, pu_width);
     if (context_ptr->fractional_search_method == SSD_SEARCH) {
         if (dist < *p_best_ssd) {
-            *p_best_sad = (uint32_t)nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[4] + search_region_index1, buf1_stride[4], buf2[4] + search_region_index2, buf2_stride[4], pu_height, pu_width);
+            *p_best_sad = (uint32_t)nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[4] + search_region_index1, buf1_stride[4], buf2[4] + search_region_index2, buf2_stride[4], pu_height, pu_width);
             *p_best_mv = ((uint16_t)y_mv_quarter[4] << 16) | ((uint16_t)x_mv_quarter[4]);
             *p_best_ssd = (uint32_t)dist;
         }
@@ -1378,12 +1378,12 @@ static void pu_quarter_pel_refinement_on_the_fly(
     dist = (context_ptr->fractional_search_method == SSD_SEARCH) ?
         combined_averaging_ssd(&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[5] + search_region_index1, buf1_stride[5], buf2[5] + search_region_index2, buf2_stride[5], pu_height, pu_width) :
         (context_ptr->fractional_search_method == SUB_SAD_SEARCH) ?
-        (nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE << 1, buf1[5] + search_region_index1, buf1_stride[5] << 1, buf2[5] + search_region_index2, buf2_stride[5] << 1, pu_height >> 1, pu_width)) << 1 :
-        nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[5] + search_region_index1, buf1_stride[5], buf2[5] + search_region_index2, buf2_stride[5], pu_height, pu_width);
+        (nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE << 1, buf1[5] + search_region_index1, buf1_stride[5] << 1, buf2[5] + search_region_index2, buf2_stride[5] << 1, pu_height >> 1, pu_width)) << 1 :
+        nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[5] + search_region_index1, buf1_stride[5], buf2[5] + search_region_index2, buf2_stride[5], pu_height, pu_width);
     if (context_ptr->fractional_search_method == SSD_SEARCH) {
         if (dist < *p_best_ssd) {
-            *p_best_sad = (uint32_t)nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[5] + search_region_index1, buf1_stride[5], buf2[5] + search_region_index2, buf2_stride[5], pu_height, pu_width);
+            *p_best_sad = (uint32_t)nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[5] + search_region_index1, buf1_stride[5], buf2[5] + search_region_index2, buf2_stride[5], pu_height, pu_width);
             *p_best_mv = ((uint16_t)y_mv_quarter[5] << 16) | ((uint16_t)x_mv_quarter[5]);
             *p_best_ssd = (uint32_t)dist;
         }
@@ -1405,12 +1405,12 @@ static void pu_quarter_pel_refinement_on_the_fly(
     dist = (context_ptr->fractional_search_method == SSD_SEARCH) ?
         combined_averaging_ssd(&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[6] + search_region_index1, buf1_stride[6], buf2[6] + search_region_index2, buf2_stride[6], pu_height, pu_width) :
         (context_ptr->fractional_search_method == SUB_SAD_SEARCH) ?
-        (nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE << 1, buf1[6] + search_region_index1, buf1_stride[6] << 1, buf2[6] + search_region_index2, buf2_stride[6] << 1, pu_height >> 1, pu_width)) << 1 :
-        nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[6] + search_region_index1, buf1_stride[6], buf2[6] + search_region_index2, buf2_stride[6], pu_height, pu_width);
+        (nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE << 1, buf1[6] + search_region_index1, buf1_stride[6] << 1, buf2[6] + search_region_index2, buf2_stride[6] << 1, pu_height >> 1, pu_width)) << 1 :
+        nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[6] + search_region_index1, buf1_stride[6], buf2[6] + search_region_index2, buf2_stride[6], pu_height, pu_width);
     if (context_ptr->fractional_search_method == SSD_SEARCH) {
         if (dist < *p_best_ssd) {
-            *p_best_sad = (uint32_t)nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[6] + search_region_index1, buf1_stride[6], buf2[6] + search_region_index2, buf2_stride[6], pu_height, pu_width);
+            *p_best_sad = (uint32_t)nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[6] + search_region_index1, buf1_stride[6], buf2[6] + search_region_index2, buf2_stride[6], pu_height, pu_width);
             *p_best_mv = ((uint16_t)y_mv_quarter[6] << 16) | ((uint16_t)x_mv_quarter[6]);
             *p_best_ssd = (uint32_t)dist;
         }
@@ -1432,12 +1432,12 @@ static void pu_quarter_pel_refinement_on_the_fly(
     dist = (context_ptr->fractional_search_method == SSD_SEARCH) ?
         combined_averaging_ssd(&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[7] + search_region_index1, buf1_stride[7], buf2[7] + search_region_index2, buf2_stride[7], pu_height, pu_width) :
         (context_ptr->fractional_search_method == SUB_SAD_SEARCH) ?
-        (nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE << 1, buf1[7] + search_region_index1, buf1_stride[7] << 1, buf2[7] + search_region_index2, buf2_stride[7] << 1, pu_height >> 1, pu_width)) << 1 :
-        nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[7] + search_region_index1, buf1_stride[7], buf2[7] + search_region_index2, buf2_stride[7], pu_height, pu_width);
+        (nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE << 1, buf1[7] + search_region_index1, buf1_stride[7] << 1, buf2[7] + search_region_index2, buf2_stride[7] << 1, pu_height >> 1, pu_width)) << 1 :
+        nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[7] + search_region_index1, buf1_stride[7], buf2[7] + search_region_index2, buf2_stride[7], pu_height, pu_width);
     if (context_ptr->fractional_search_method == SSD_SEARCH) {
         if (dist < *p_best_ssd) {
-            *p_best_sad = (uint32_t)nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[7] + search_region_index1, buf1_stride[7], buf2[7] + search_region_index2, buf2_stride[7], pu_height, pu_width);
+            *p_best_sad = (uint32_t)nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](&(context_ptr->sb_buffer[pu_sb_buffer_index]), MAX_SB_SIZE, buf1[7] + search_region_index1, buf1_stride[7], buf2[7] + search_region_index2, buf2_stride[7], pu_height, pu_width);
             *p_best_mv = ((uint16_t)y_mv_quarter[7] << 16) | ((uint16_t)x_mv_quarter[7]);
             *p_best_ssd = (uint32_t)dist;
         }
@@ -2513,7 +2513,7 @@ static void quarter_pel_compensation(
     buf1 = buf1 + pu_shift_x_index + pu_shift_y_index * ref_stride1;
     buf2 = buf2 + pu_shift_x_index + pu_shift_y_index * ref_stride2;
-    picture_average_array[(ASM_TYPES & PREAVX2_MASK) && 1](buf1, ref_stride1, buf2, ref_stride2, dst, dst_stride, pu_width, pu_height);
+    picture_average_array[(ASM_TYPES & AVX2_MASK) && 1](buf1, ref_stride1, buf2, ref_stride2, dst, dst_stride, pu_width, pu_height);
     return;
 }
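
quarter_pel_compensation above blends two half-pel planes into a quarter-pel prediction via picture_average_array. The kernel bodies are not in this diff; a C fallback with the argument order used at the call site, assuming conventional (a + b + 1) >> 1 rounding:

    #include <stdint.h>

    /* Assumed semantics of the picture_average kernels; the rounding offset
     * is an assumption, not confirmed by this patch. */
    void picture_average_c(const uint8_t *ref1, uint32_t ref_stride1,
                           const uint8_t *ref2, uint32_t ref_stride2,
                           uint8_t *dst, uint32_t dst_stride,
                           uint32_t width, uint32_t height) {
        for (uint32_t y = 0; y < height; ++y)
            for (uint32_t x = 0; x < width; ++x)
                dst[y * dst_stride + x] =
                    (uint8_t)((ref1[y * ref_stride1 + x] + ref2[y * ref_stride2 + x] + 1) >> 1);
    }
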
@@ -2636,7 +2636,7 @@ uint32_t bi_pred_averging(
     // bi-pred luma
     me_candidate->distortion = (context_ptr->fractional_search_method == SUB_SAD_SEARCH) ?
-        nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][pu_width >> 3](source_pic,
+        nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](source_pic,
             luma_stride << 1,
             ptr_list0,
             ptr_list0_stride << 1,
@@ -2644,7 +2644,7 @@ uint32_t bi_pred_averging(
             ptr_list1_stride << 1,
             pu_height >> 1,
             pu_width) << 1 :
-        nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][pu_width >> 3](source_pic,
+        nx_m_sad_averaging_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][pu_width >> 3](source_pic,
             luma_stride,
             ptr_list0,
             ptr_list0_stride,
@@ -4086,7 +4086,7 @@ EbErrorType motion_estimate_sb(
     {
         {
-            initialize_buffer_32bits_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1](context_ptr->p_sb_best_sad[list_index][0], 21, 1, MAX_SAD_VALUE);
+            initialize_buffer_32bits_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1](context_ptr->p_sb_best_sad[list_index][0], 21, 1, MAX_SAD_VALUE);
             context_ptr->p_best_sad64x64 = &(context_ptr->p_sb_best_sad[list_index][0][ME_TIER_ZERO_PU_64x64]);
             context_ptr->p_best_sad32x32 = &(context_ptr->p_sb_best_sad[list_index][0][ME_TIER_ZERO_PU_32x32_0]);
             context_ptr->p_best_sad16x16 = &(context_ptr->p_sb_best_sad[list_index][0][ME_TIER_ZERO_PU_16x16_0]);
diff --git a/Source/Lib/Codec/EbPictureAnalysisProcess.c b/Source/Lib/Codec/EbPictureAnalysisProcess.c
index a5b0a09..5dc36cf 100644
--- a/Source/Lib/Codec/EbPictureAnalysisProcess.c
+++ b/Source/Lib/Codec/EbPictureAnalysisProcess.c
@@ -3508,7 +3508,7 @@ void sub_sample_luma_generate_pixel_intensity_histogram_bins(
     // Initialize bins to 1
-    initialize_buffer_32bits_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1](picture_control_set_ptr->picture_histogram[region_in_picture_width_index][region_in_picture_height_index][0], 64, 0, 1);
+    initialize_buffer_32bits_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1](picture_control_set_ptr->picture_histogram[region_in_picture_width_index][region_in_picture_height_index][0], 64, 0, 1);
     region_width_offset = (region_in_picture_width_index == sequence_control_set_ptr->picture_analysis_number_of_regions_per_width - 1) ?
         input_picture_ptr->width - (sequence_control_set_ptr->picture_analysis_number_of_regions_per_width * region_width) :
@@ -3567,8 +3567,8 @@ void sub_sample_chroma_generate_pixel_intensity_histogram_bins(
     // Initialize bins to 1
-    initialize_buffer_32bits_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1](picture_control_set_ptr->picture_histogram[region_in_picture_width_index][region_in_picture_height_index][1], 64, 0, 1);
-    initialize_buffer_32bits_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1](picture_control_set_ptr->picture_histogram[region_in_picture_width_index][region_in_picture_height_index][2], 64, 0, 1);
+    initialize_buffer_32bits_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1](picture_control_set_ptr->picture_histogram[region_in_picture_width_index][region_in_picture_height_index][1], 64, 0, 1);
+    initialize_buffer_32bits_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1](picture_control_set_ptr->picture_histogram[region_in_picture_width_index][region_in_picture_height_index][2], 64, 0, 1);
     region_width_offset = (region_in_picture_width_index == sequence_control_set_ptr->picture_analysis_number_of_regions_per_width - 1) ?
         input_picture_ptr->width - (sequence_control_set_ptr->picture_analysis_number_of_regions_per_width * region_width) :
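
initialize_buffer_32bits appears here and in motion_estimate_sb above with (ptr, 64, 0, 1) and (ptr, 21, 1, MAX_SAD_VALUE). A plausible C fallback, assuming the two counts are 128-bit chunks plus leftover 32-bit words; that reading fits both call sites (64*4+0 = 256 histogram bins, 21*4+1 = 85 SAD slots for the 85 PUs of a 64x64 SB), but it is an inference, not a signature shown in this diff.

    #include <stdint.h>

    /* Assumed semantics: fill count128*4 + count32 words with value. */
    void initialize_buffer_32bits_c(uint32_t *ptr, uint32_t count128,
                                    uint32_t count32, uint32_t value) {
        uint32_t total = count128 * 4 + count32;
        for (uint32_t i = 0; i < total; ++i)
            ptr[i] = value;
    }
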
diff --git a/Source/Lib/Codec/EbPictureOperators.c b/Source/Lib/Codec/EbPictureOperators.c
index 2c9c252..b0efd1e 100644
--- a/Source/Lib/Codec/EbPictureOperators.c
+++ b/Source/Lib/Codec/EbPictureOperators.c
@@ -28,7 +28,7 @@ void picture_addition(
     uint32_t height)
 {
-    addition_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][width >> 3](
+    addition_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][width >> 3](
         pred_ptr,
         pred_stride,
         residual_ptr,
@@ -63,7 +63,7 @@ EbErrorType picture_copy8_bit(
     // Execute the Kernels
     if (component_mask & PICTURE_BUFFER_DESC_Y_FLAG) {
-        pic_copy_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][area_width>>3](
+        pic_copy_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][area_width>>3](
            &(src->buffer_y[src_luma_origin_index]),
            src->stride_y,
            &(dst->buffer_y[dst_luma_origin_index]),
@@ -74,7 +74,7 @@ EbErrorType picture_copy8_bit(
     if (component_mask & PICTURE_BUFFER_DESC_Cb_FLAG) {
-        pic_copy_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][chroma_area_width >> 3](
+        pic_copy_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][chroma_area_width >> 3](
            &(src->buffer_cb[src_chroma_origin_index]),
            src->stride_cb,
            &(dst->buffer_cb[dst_chroma_origin_index]),
@@ -85,7 +85,7 @@ EbErrorType picture_copy8_bit(
     if (component_mask & PICTURE_BUFFER_DESC_Cr_FLAG) {
-        pic_copy_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][chroma_area_width >> 3](
+        pic_copy_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][chroma_area_width >> 3](
            &(src->buffer_cr[src_chroma_origin_index]),
            src->stride_cr,
            &(dst->buffer_cr[dst_chroma_origin_index]),
@@ -114,7 +114,7 @@ void picture_sub_sampled_residual(
     uint8_t last_line) //the last line has correct prediction data, so no duplication to be done.
 {
-    residual_kernel_sub_sampled_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][area_width>>3](
+    residual_kernel_sub_sampled_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][area_width>>3](
         input,
         input_stride,
         pred,
@@ -142,7 +142,7 @@ void picture_residual(
     uint32_t area_height)
 {
-    residual_kernel_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][area_width>>3](
+    residual_kernel_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][area_width>>3](
         input,
         input_stride,
         pred,
@@ -170,7 +170,7 @@ void picture_residual16bit(
     uint32_t area_height)
 {
-    residual_kernel_func_ptr_array16_bit[(ASM_TYPES & PREAVX2_MASK) && 1](
+    residual_kernel_func_ptr_array16_bit[(ASM_TYPES & AVX2_MASK) && 1](
         input,
         input_stride,
         pred,
@@ -203,7 +203,7 @@ EbErrorType picture_full_distortion(
     distortion[0] = 0;
     distortion[1] = 0;
     // Y
-    full_distortion_intrinsic_func_ptr_array[(ASM_TYPES & PREAVX2_MASK) && 1][eob != 0][0][area_size >> 3](
+    full_distortion_intrinsic_func_ptr_array[(ASM_TYPES & AVX2_MASK) && 1][eob != 0][0][area_size >> 3](
         &(((int16_t*) coeff->buffer_y)[coeff_origin_index]),
         coeff->stride_y,
         &(((int16_t*) recon_coeff->buffer_y)[recon_coeff_origin_index]),
@@ -227,7 +227,7 @@ void extract_8bit_data(
 )
 {
-    unpack_8bit_func_ptr_array_16bit[((width & 3) == 0) && ((height & 1)== 0)][(ASM_TYPES & PREAVX2_MASK) && 1](
+    unpack_8bit_func_ptr_array_16bit[((width & 3) == 0) && ((height & 1)== 0)][(ASM_TYPES & AVX2_MASK) && 1](
        in16_bit_buffer,
        in_stride,
       out8_bit_buffer,