@@ -1933,7 +1933,7 @@ AP4_AesCtrBlockCipher::Process(const AP4_UI08* input,
19331933 } else {
19341934 AP4_SetMemory (counter, 0 , AP4_AES_BLOCK_SIZE);
19351935 }
1936-
1936+
19371937 // process all blocks
19381938 while (input_size) {
19391939 AP4_UI08 block[AP4_AES_BLOCK_SIZE];
@@ -1963,6 +1963,150 @@ AP4_AesCtrBlockCipher::Process(const AP4_UI08* input,
19631963 return AP4_SUCCESS;
19641964}
19651965
1966+ #if AP4_AES_BLOCK_SIZE == 16 && AP4_AES_KEY_LENGTH == 16 && defined(__AES__) && defined(__SSE4_2__)
1967+ #define AP4_ENABLE_AESNI
1968+
1969+ #include < immintrin.h>
1970+
1971+ #define cpuid (func,ax,bx,cx,dx )\
1972+ __asm__ __volatile__ (" cpuid" :\
1973+ " =a" (ax), " =b" (bx), " =c" (cx), " =d" (dx) : " a" (func));
1974+
1975+ static bool g_SupportAesNI = false ;
1976+ __attribute__ ((constructor)) static void detect_aesni ()
1977+ {
1978+ unsigned int a,b,c,d;
1979+ cpuid (1 , a,b,c,d);
1980+ g_SupportAesNI = c & 0x2000000 ;
1981+ }
1982+
1983+ #endif
1984+
1985+ #ifdef AP4_ENABLE_AESNI
1986+ template <AP4_Size NB>
1987+ static void aesni_process_NB_blocks (AP4_UI08* Out, AP4_UI08 const * In, __m128i& CB, AP4_UI08 const * Keys)
1988+ {
1989+ __m128i CBs[NB];
1990+ const __m128i CB_ = CB;
1991+ #pragma unroll
1992+ for (AP4_Size i = 0 ; i < NB; ++i) {
1993+ CBs[i] = _mm_add_epi64 (CB_, _mm_set_epi32 (0 ,i+1 ,0 ,0 ));
1994+ }
1995+ CB = CBs[NB-1 ];
1996+ const __m128i Bswap64 = _mm_set_epi8 (8 ,9 ,10 ,11 ,12 ,13 ,14 ,15 ,7 ,6 ,5 ,4 ,3 ,2 ,1 ,0 );
1997+ #pragma unroll
1998+ for (AP4_Size i = 0 ; i < NB; ++i) {
1999+ CBs[i] = _mm_shuffle_epi8 (CBs[i], Bswap64);
2000+ }
2001+
2002+ __m128i EncrCBs[NB];
2003+ const __m128i Key0 = _mm_loadu_si128 ((const __m128i*)Keys);
2004+ #pragma unroll
2005+ for (AP4_Size i = 0 ; i < NB; ++i) {
2006+ EncrCBs[i] = _mm_xor_si128 (CBs[i], Key0);
2007+ }
2008+ #pragma unroll
2009+ for (AP4_Size R = 1 ; R < 10 ; R++) {
2010+ const __m128i Key = _mm_loadu_si128 ((const __m128i*)&Keys[R*AP4_AES_KEY_LENGTH]);
2011+ #pragma unroll
2012+ for (AP4_Size i = 0 ; i < NB; ++i) {
2013+ EncrCBs[i] = _mm_aesenc_si128 (EncrCBs[i], Key);
2014+ }
2015+ }
2016+
2017+ const __m128i KeyLast = _mm_loadu_si128 ((const __m128i*)&Keys[10 *AP4_AES_KEY_LENGTH]);
2018+ #pragma unroll
2019+ for (AP4_Size i = 0 ; i < NB; ++i) {
2020+ EncrCBs[i] = _mm_aesenclast_si128 (EncrCBs[i],
2021+ _mm_xor_si128 (KeyLast, _mm_loadu_si128 ((const __m128i*)&In[i*AP4_AES_BLOCK_SIZE])));
2022+ }
2023+
2024+ #pragma unroll
2025+ for (AP4_Size i = 0 ; i < NB; ++i) {
2026+ _mm_storeu_si128 ((__m128i*)&Out[i*AP4_AES_BLOCK_SIZE], EncrCBs[i]);
2027+ }
2028+ }
2029+
2030+ static void process (AP4_UI08* output, AP4_UI08 const * input, AP4_Size input_size, const AP4_UI08* iv, const AP4_UI08* Keys)
2031+ {
2032+ union {
2033+ AP4_UI08 B[AP4_AES_BLOCK_SIZE];
2034+ __m128i V;
2035+ } Counter;
2036+
2037+ if (iv) {
2038+ memcpy (&Counter.B [0 ], iv, AP4_AES_BLOCK_SIZE);
2039+ Counter.V = _mm_shuffle_epi8 (Counter.V , _mm_set_epi8 (8 ,9 ,10 ,11 ,12 ,13 ,14 ,15 ,7 ,6 ,5 ,4 ,3 ,2 ,1 ,0 ));
2040+ Counter.V = _mm_sub_epi64 (Counter.V , _mm_set_epi32 (0 ,1 ,0 ,0 ));
2041+ } else {
2042+ memset (&Counter.B [0 ], 0 , AP4_AES_BLOCK_SIZE);
2043+ }
2044+
2045+ // First, process blocks eight by eight (Intel recommandation)
2046+ const AP4_Size Size8B = 8 *AP4_AES_BLOCK_SIZE;
2047+ const AP4_Size End8B = (input_size/Size8B)*Size8B;
2048+ for (AP4_Size i = 0 ; i < End8B; i += Size8B) {
2049+ aesni_process_NB_blocks<8 >(&output[i], &input[i], Counter.V , Keys);
2050+ }
2051+ // Process the remaining blocks!
2052+ const AP4_Size RemBlocks = (input_size-End8B)/AP4_AES_BLOCK_SIZE;
2053+ AP4_Size CurIdx = End8B;
2054+ switch (RemBlocks) {
2055+ #define FINAL_BLOCKS (N )\
2056+ case N:\
2057+ aesni_process_NB_blocks<N>(&output[CurIdx], &input[CurIdx], Counter.V , Keys);\
2058+ CurIdx += N*AP4_AES_BLOCK_SIZE;\
2059+ break ;
2060+
2061+ FINAL_BLOCKS (7 )
2062+ FINAL_BLOCKS (6 )
2063+ FINAL_BLOCKS (5 )
2064+ FINAL_BLOCKS (4 )
2065+ FINAL_BLOCKS (3 )
2066+ FINAL_BLOCKS (2 )
2067+ FINAL_BLOCKS (1 )
2068+ #undef FINAL_BLOCKS
2069+ }
2070+ const AP4_Size Rem = input_size-CurIdx;
2071+ assert (Rem < 16 && " too many remaining bytes!" );
2072+ if (Rem > 0 ) {
2073+ // Last block
2074+ AP4_UI08 LastBlock[AP4_AES_BLOCK_SIZE];
2075+ memcpy (&LastBlock[0 ], &input[CurIdx], Rem);
2076+ aesni_process_NB_blocks<1 >(&LastBlock[0 ], &LastBlock[0 ], Counter.V , Keys);
2077+ memcpy (&output[CurIdx], &LastBlock[0 ], Rem);
2078+ }
2079+ }
2080+
2081+ class AP4_AesNICtrBlockCipher : public AP4_AesBlockCipher
2082+ {
2083+ public:
2084+ AP4_AesNICtrBlockCipher (CipherDirection direction,
2085+ unsigned int counter_size,
2086+ aes_ctx* context) :
2087+ AP4_AesBlockCipher (direction, CTR, context)
2088+ {
2089+ assert (counter_size == 8 && " counter size must be 8 bytes!" );
2090+ }
2091+
2092+ // AP4_BlockCipher methods
2093+ virtual AP4_Result Process (const AP4_UI08* input,
2094+ AP4_Size input_size,
2095+ AP4_UI08* output,
2096+ const AP4_UI08* iv);
2097+ };
2098+
2099+ AP4_Result AP4_AesNICtrBlockCipher::Process (const AP4_UI08* input,
2100+ AP4_Size input_size,
2101+ AP4_UI08* output,
2102+ const AP4_UI08* iv)
2103+ {
2104+ assert (m_Context->n_rnd == 10 && " this only works for AES128!" );
2105+ process (output, input, input_size, iv, (const AP4_UI08*) &m_Context->k_sch [0 ]);
2106+ return AP4_SUCCESS;
2107+ }
2108+ #endif // AP4_ENABLE_AESNI
2109+
19662110/* ----------------------------------------------------------------------
19672111| AP4_AesBlockCipher::Create
19682112+---------------------------------------------------------------------*/
@@ -1994,7 +2138,16 @@ AP4_AesBlockCipher::Create(const AP4_UI08* key,
19942138 if (ctr_params) {
19952139 counter_size = ctr_params->counter_size ;
19962140 }
1997- cipher = new AP4_AesCtrBlockCipher (direction, counter_size, context);
2141+ #ifdef AP4_ENABLE_AESNI
2142+ if (g_SupportAesNI && (counter_size == 8 )) {
2143+ cipher = new AP4_AesNICtrBlockCipher (direction, counter_size, context);
2144+ }
2145+ else {
2146+ #endif
2147+ cipher = new AP4_AesCtrBlockCipher (direction, counter_size, context);
2148+ #ifdef AP4_ENABLE_AESNI
2149+ }
2150+ #endif
19982151 break ;
19992152 }
20002153
0 commit comments