Skip to content

Commit f2b676d

Browse files
authored
Merge pull request #371 from aguinet/feature/aesni
Use AESNI for AES-CTR if available
2 parents 0010f4f + 08941cc commit f2b676d

File tree

1 file changed

+154
-1
lines changed

1 file changed

+154
-1
lines changed

Source/C++/Crypto/Ap4AesBlockCipher.cpp

Lines changed: 154 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1964,6 +1964,150 @@ AP4_AesCtrBlockCipher::Process(const AP4_UI08* input,
19641964
return AP4_SUCCESS;
19651965
}
19661966

1967+
#if AP4_AES_BLOCK_SIZE == 16 && AP4_AES_KEY_LENGTH == 16 && defined(__AES__) && defined(__SSE4_2__)
1968+
#define AP4_ENABLE_AESNI
1969+
1970+
#include <immintrin.h>
1971+
1972+
#define cpuid(func,ax,bx,cx,dx)\
1973+
__asm__ __volatile__ ("cpuid":\
1974+
"=a" (ax), "=b" (bx), "=c" (cx), "=d" (dx) : "a" (func));
1975+
1976+
static bool g_SupportAesNI = false;
1977+
__attribute__((constructor)) static void detect_aesni()
1978+
{
1979+
unsigned int a,b,c,d;
1980+
cpuid(1, a,b,c,d);
1981+
g_SupportAesNI = c & 0x2000000;
1982+
}
1983+
1984+
#endif
1985+
1986+
#ifdef AP4_ENABLE_AESNI
1987+
template <AP4_Size NB>
1988+
static void aesni_process_NB_blocks(AP4_UI08* Out, AP4_UI08 const* In, __m128i& CB, AP4_UI08 const* Keys)
1989+
{
1990+
__m128i CBs[NB];
1991+
const __m128i CB_ = CB;
1992+
#pragma unroll
1993+
for (AP4_Size i = 0; i < NB; ++i) {
1994+
CBs[i] = _mm_add_epi64(CB_, _mm_set_epi32(0,i+1,0,0));
1995+
}
1996+
CB = CBs[NB-1];
1997+
const __m128i Bswap64 = _mm_set_epi8(8,9,10,11,12,13,14,15,7,6,5,4,3,2,1,0);
1998+
#pragma unroll
1999+
for (AP4_Size i = 0; i < NB; ++i) {
2000+
CBs[i] = _mm_shuffle_epi8(CBs[i], Bswap64);
2001+
}
2002+
2003+
__m128i EncrCBs[NB];
2004+
const __m128i Key0 = _mm_loadu_si128((const __m128i*)Keys);
2005+
#pragma unroll
2006+
for (AP4_Size i = 0; i < NB; ++i) {
2007+
EncrCBs[i] = _mm_xor_si128(CBs[i], Key0);
2008+
}
2009+
#pragma unroll
2010+
for (AP4_Size R = 1; R < 10; R++) {
2011+
const __m128i Key = _mm_loadu_si128((const __m128i*)&Keys[R*AP4_AES_KEY_LENGTH]);
2012+
#pragma unroll
2013+
for (AP4_Size i = 0; i < NB; ++i) {
2014+
EncrCBs[i] = _mm_aesenc_si128(EncrCBs[i], Key);
2015+
}
2016+
}
2017+
2018+
const __m128i KeyLast = _mm_loadu_si128((const __m128i*)&Keys[10*AP4_AES_KEY_LENGTH]);
2019+
#pragma unroll
2020+
for (AP4_Size i = 0; i < NB; ++i) {
2021+
EncrCBs[i] = _mm_aesenclast_si128(EncrCBs[i],
2022+
_mm_xor_si128(KeyLast, _mm_loadu_si128((const __m128i*)&In[i*AP4_AES_BLOCK_SIZE])));
2023+
}
2024+
2025+
#pragma unroll
2026+
for (AP4_Size i = 0; i < NB; ++i) {
2027+
_mm_storeu_si128((__m128i*)&Out[i*AP4_AES_BLOCK_SIZE], EncrCBs[i]);
2028+
}
2029+
}
2030+
2031+
static void process(AP4_UI08* output, AP4_UI08 const* input, AP4_Size input_size, const AP4_UI08* iv, const AP4_UI08* Keys)
2032+
{
2033+
union {
2034+
AP4_UI08 B[AP4_AES_BLOCK_SIZE];
2035+
__m128i V;
2036+
} Counter;
2037+
2038+
if (iv) {
2039+
memcpy(&Counter.B[0], iv, AP4_AES_BLOCK_SIZE);
2040+
Counter.V = _mm_shuffle_epi8(Counter.V, _mm_set_epi8(8,9,10,11,12,13,14,15,7,6,5,4,3,2,1,0));
2041+
Counter.V = _mm_sub_epi64(Counter.V, _mm_set_epi32(0,1,0,0));
2042+
} else {
2043+
memset(&Counter.B[0], 0, AP4_AES_BLOCK_SIZE);
2044+
}
2045+
2046+
// First, process blocks eight by eight (Intel recommandation)
2047+
const AP4_Size Size8B = 8*AP4_AES_BLOCK_SIZE;
2048+
const AP4_Size End8B = (input_size/Size8B)*Size8B;
2049+
for (AP4_Size i = 0; i < End8B; i += Size8B) {
2050+
aesni_process_NB_blocks<8>(&output[i], &input[i], Counter.V, Keys);
2051+
}
2052+
// Process the remaining blocks!
2053+
const AP4_Size RemBlocks = (input_size-End8B)/AP4_AES_BLOCK_SIZE;
2054+
AP4_Size CurIdx = End8B;
2055+
switch (RemBlocks) {
2056+
#define FINAL_BLOCKS(N)\
2057+
case N:\
2058+
aesni_process_NB_blocks<N>(&output[CurIdx], &input[CurIdx], Counter.V, Keys);\
2059+
CurIdx += N*AP4_AES_BLOCK_SIZE;\
2060+
break;
2061+
2062+
FINAL_BLOCKS(7)
2063+
FINAL_BLOCKS(6)
2064+
FINAL_BLOCKS(5)
2065+
FINAL_BLOCKS(4)
2066+
FINAL_BLOCKS(3)
2067+
FINAL_BLOCKS(2)
2068+
FINAL_BLOCKS(1)
2069+
#undef FINAL_BLOCKS
2070+
}
2071+
const AP4_Size Rem = input_size-CurIdx;
2072+
assert(Rem < 16 && "too many remaining bytes!");
2073+
if (Rem > 0) {
2074+
// Last block
2075+
AP4_UI08 LastBlock[AP4_AES_BLOCK_SIZE];
2076+
memcpy(&LastBlock[0], &input[CurIdx], Rem);
2077+
aesni_process_NB_blocks<1>(&LastBlock[0], &LastBlock[0], Counter.V, Keys);
2078+
memcpy(&output[CurIdx], &LastBlock[0], Rem);
2079+
}
2080+
}
2081+
2082+
class AP4_AesNICtrBlockCipher : public AP4_AesBlockCipher
2083+
{
2084+
public:
2085+
AP4_AesNICtrBlockCipher(CipherDirection direction,
2086+
unsigned int counter_size,
2087+
aes_ctx* context) :
2088+
AP4_AesBlockCipher(direction, CTR, context)
2089+
{
2090+
assert(counter_size == 8 && "counter size must be 8 bytes!");
2091+
}
2092+
2093+
// AP4_BlockCipher methods
2094+
virtual AP4_Result Process(const AP4_UI08* input,
2095+
AP4_Size input_size,
2096+
AP4_UI08* output,
2097+
const AP4_UI08* iv);
2098+
};
2099+
2100+
AP4_Result AP4_AesNICtrBlockCipher::Process(const AP4_UI08* input,
2101+
AP4_Size input_size,
2102+
AP4_UI08* output,
2103+
const AP4_UI08* iv)
2104+
{
2105+
assert(m_Context->n_rnd == 10 && "this only works for AES128!");
2106+
process(output, input, input_size, iv, (const AP4_UI08*) &m_Context->k_sch[0]);
2107+
return AP4_SUCCESS;
2108+
}
2109+
#endif // AP4_ENABLE_AESNI
2110+
19672111
/*----------------------------------------------------------------------
19682112
| AP4_AesBlockCipher::Create
19692113
+---------------------------------------------------------------------*/
@@ -1995,7 +2139,16 @@ AP4_AesBlockCipher::Create(const AP4_UI08* key,
19952139
if (ctr_params) {
19962140
counter_size = ctr_params->counter_size;
19972141
}
1998-
cipher = new AP4_AesCtrBlockCipher(direction, counter_size, context);
2142+
#ifdef AP4_ENABLE_AESNI
2143+
if (g_SupportAesNI && (counter_size == 8)) {
2144+
cipher = new AP4_AesNICtrBlockCipher(direction, counter_size, context);
2145+
}
2146+
else {
2147+
#endif
2148+
cipher = new AP4_AesCtrBlockCipher(direction, counter_size, context);
2149+
#ifdef AP4_ENABLE_AESNI
2150+
}
2151+
#endif
19992152
break;
20002153
}
20012154

0 commit comments

Comments
 (0)