@@ -1964,6 +1964,165 @@ AP4_AesCtrBlockCipher::Process(const AP4_UI08* input,
     return AP4_SUCCESS;
 }
 
+#if AP4_AES_BLOCK_SIZE == 16 && AP4_AES_KEY_LENGTH == 16 && defined(__AES__) && defined(__SSE4_2__)
+#define AP4_ENABLE_AESNI
+
+#include <immintrin.h>
+
+#define cpuid(func, ax, bx, cx, dx)\
+    __asm__ __volatile__ ("cpuid":\
+    "=a" (ax), "=b" (bx), "=c" (cx), "=d" (dx) : "a" (func));
+
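+// Runtime detection: CPUID leaf 1 reports AES-NI support in ECX bit 25
+// (mask 0x2000000); the constructor attribute runs the check at load time.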
+static bool g_SupportAesNI = false;
+__attribute__((constructor)) static void detect_aesni()
+{
+    unsigned int a, b, c, d;
+    cpuid(1, a, b, c, d);
+    g_SupportAesNI = c & 0x2000000;
+}
+
+#endif
+
+#ifdef AP4_ENABLE_AESNI
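+// Encrypts NB consecutive counter blocks with AES-128 and XORs the keystream
+// into the input (CTR mode). CB carries the counter block with its 64-bit
+// counter in the upper quadword in native byte order; it advances by NB.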
+template <AP4_Size NB>
+static void aesni_process_NB_blocks(AP4_UI08* Out, AP4_UI08 const* In, __m128i& CB, AP4_UI08 const* Keys)
+{
+    __m128i CBs[NB];
+    const __m128i CB_ = CB;
+    #pragma unroll
+    for (AP4_Size i = 0; i < NB; ++i) {
+        CBs[i] = _mm_add_epi64(CB_, _mm_set_epi32(0, i+1, 0, 0));
+    }
+    CB = CBs[NB-1];
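+    // Restore big-endian byte order in the counter quadword before encryption;
+    // the shuffle leaves the low (nonce) quadword untouched.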
+    const __m128i Bswap64 = _mm_set_epi8(8,9,10,11,12,13,14,15,7,6,5,4,3,2,1,0);
+    #pragma unroll
+    for (AP4_Size i = 0; i < NB; ++i) {
+        CBs[i] = _mm_shuffle_epi8(CBs[i], Bswap64);
+    }
+
+    __m128i EncrCBs[NB];
+    const __m128i Key0 = _mm_loadu_si128((const __m128i*)Keys);
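+    // Round 0: whitening XOR with the first round key.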
+    #pragma unroll
+    for (AP4_Size i = 0; i < NB; ++i) {
+        EncrCBs[i] = _mm_xor_si128(CBs[i], Key0);
+    }
+    #pragma unroll
+    for (AP4_Size R = 1; R < 10; R++) {
+        const __m128i Key = _mm_loadu_si128((const __m128i*)&Keys[R*AP4_AES_KEY_LENGTH]);
+        #pragma unroll
+        for (AP4_Size i = 0; i < NB; ++i) {
+            EncrCBs[i] = _mm_aesenc_si128(EncrCBs[i], Key);
+        }
+    }
+
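+    // Final AES round: XORing the input into the last round key lets
+    // _mm_aesenclast_si128 apply the round key and the CTR keystream XOR
+    // in a single step.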
+    const __m128i KeyLast = _mm_loadu_si128((const __m128i*)&Keys[10*AP4_AES_KEY_LENGTH]);
+    #pragma unroll
+    for (AP4_Size i = 0; i < NB; ++i) {
+        EncrCBs[i] = _mm_aesenclast_si128(EncrCBs[i],
+            _mm_xor_si128(KeyLast, _mm_loadu_si128((const __m128i*)&In[i*AP4_AES_BLOCK_SIZE])));
+    }
+
+    #pragma unroll
+    for (AP4_Size i = 0; i < NB; ++i) {
+        _mm_storeu_si128((__m128i*)&Out[i*AP4_AES_BLOCK_SIZE], EncrCBs[i]);
+    }
+}
+
+static void process(AP4_UI08* output, AP4_UI08 const* input, AP4_Size input_size, const AP4_UI08* iv, const AP4_UI08* Keys)
+{
+    union {
+        AP4_UI08 B[AP4_AES_BLOCK_SIZE];
+        __m128i  V;
+    } Counter;
+
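+    // aesni_process_NB_blocks pre-increments the counter before encrypting,
+    // so seed it one below the IV; the first keystream block then uses the IV.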
+    if (iv) {
+        memcpy(&Counter.B[0], iv, AP4_AES_BLOCK_SIZE);
+        Counter.V = _mm_shuffle_epi8(Counter.V, _mm_set_epi8(8,9,10,11,12,13,14,15,7,6,5,4,3,2,1,0));
+        Counter.V = _mm_sub_epi64(Counter.V, _mm_set_epi32(0,1,0,0));
+    } else {
+        memset(&Counter.B[0], 0, AP4_AES_BLOCK_SIZE);
+    }
+
+    // First, process blocks eight by eight (per Intel's recommendation)
+    const AP4_Size Size8B = 8*AP4_AES_BLOCK_SIZE;
+    const AP4_Size End8B  = (input_size/Size8B)*Size8B;
+    for (AP4_Size i = 0; i < End8B; i += Size8B) {
+        aesni_process_NB_blocks<8>(&output[i], &input[i], Counter.V, Keys);
+    }
+    // Process the remaining blocks!
+    const AP4_Size RemBlocks = (input_size-End8B)/AP4_AES_BLOCK_SIZE;
+    AP4_Size CurIdx = End8B;
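+    // Dispatch the 1..7 leftover whole blocks to the matching fixed-size
+    // instantiation of the template.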
+    switch (RemBlocks) {
+#define FINAL_BLOCKS(N)\
+    case N:\
+        aesni_process_NB_blocks<N>(&output[CurIdx], &input[CurIdx], Counter.V, Keys);\
+        CurIdx += N*AP4_AES_BLOCK_SIZE;\
+        break;
+
+    FINAL_BLOCKS(7)
+    FINAL_BLOCKS(6)
+    FINAL_BLOCKS(5)
+    FINAL_BLOCKS(4)
+    FINAL_BLOCKS(3)
+    FINAL_BLOCKS(2)
+    FINAL_BLOCKS(1)
+#undef FINAL_BLOCKS
+    }
+    const AP4_Size Rem = input_size-CurIdx;
+    assert(Rem < 16 && "too many remaining bytes!");
+    if (Rem > 0) {
+        // Partial last block: encrypt through a 16-byte scratch buffer
+        AP4_UI08 LastBlock[AP4_AES_BLOCK_SIZE];
+        memcpy(&LastBlock[0], &input[CurIdx], Rem);
+        aesni_process_NB_blocks<1>(&LastBlock[0], &LastBlock[0], Counter.V, Keys);
+        memcpy(&output[CurIdx], &LastBlock[0], Rem);
+    }
+}
+
+class AP4_AesNICtrBlockCipher : public AP4_AesBlockCipher
+{
+public:
+    AP4_AesNICtrBlockCipher(CipherDirection direction,
+                            unsigned int    counter_size,
+                            aes_ctx*        context) :
+        AP4_AesBlockCipher(direction, CTR, context)
+    {
+        assert(counter_size == 8 && "counter size must be 8 bytes!");
+    }
+
+    // AP4_BlockCipher methods
+    virtual AP4_Result Process(const AP4_UI08* input,
+                               AP4_Size        input_size,
+                               AP4_UI08*       output,
+                               const AP4_UI08* iv);
+};
+
+AP4_Result AP4_AesNICtrBlockCipher::Process(const AP4_UI08* input,
+                                            AP4_Size        input_size,
+                                            AP4_UI08*       output,
+                                            const AP4_UI08* iv)
+{
+    assert(m_Context->n_rnd == 10 && "this only works for AES128!");
+    process(output, input, input_size, iv, (const AP4_UI08*)&m_Context->k_sch[0]);
+    return AP4_SUCCESS;
+}
+#endif // AP4_ENABLE_AESNI
+
 /*----------------------------------------------------------------------
 |   AP4_AesBlockCipher::Create
 +---------------------------------------------------------------------*/
@@ -1995,7 +2139,17 @@ AP4_AesBlockCipher::Create(const AP4_UI08* key,
             if (ctr_params) {
                 counter_size = ctr_params->counter_size;
             }
-            cipher = new AP4_AesCtrBlockCipher(direction, counter_size, context);
+#ifdef AP4_ENABLE_AESNI
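+            // Use the AES-NI path when the CPU supports it and the counter size matches.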
+            if (g_SupportAesNI && (counter_size == 8)) {
+                cipher = new AP4_AesNICtrBlockCipher(direction, counter_size, context);
+            }
+            else {
+#endif
+            cipher = new AP4_AesCtrBlockCipher(direction, counter_size, context);
+#ifdef AP4_ENABLE_AESNI
+            }
+#endif
             break;
         }
 