@@ -163,9 +163,9 @@ NOTE: String length must be evenly divisible by 16byte (str_len % 16 == 0)
163163// jcallan@github points out that declaring Multiply as a function
164164// reduces code size considerably with the Keil ARM compiler.
165165// See this link for more information: https://github.com/kokke/tiny-AES-C/pull/3
166- #ifndef MULTIPLY_AS_A_FUNCTION
167- #define MULTIPLY_AS_A_FUNCTION 0
168- #endif
166+ // #ifndef MULTIPLY_AS_A_FUNCTION /*Multiply is removed*/
167+ // #define MULTIPLY_AS_A_FUNCTION 0
168+ // #endif
169169
170170
171171
@@ -394,36 +394,18 @@ static void AddRoundKey(uint32_t round, state_t* state, const uint8_t* RoundKey)
394394// state matrix with values in an S-box.
395395static void SubBytes (state_t * state )
396396{
397- #if 1
398397 unsigned int i ;
399398 for (i = 0 ; i < 4 ; i ++ )
400399 {
401400 uint32_t * pLine = ((uint32_t * )state + i );
402401 uint32_t line = * pLine ;
403402
404- uint32_t byte0 = (line & MASK32_BYTE0 ) >> OFS32_BYTE0 ;
405- uint32_t byte1 = (line & MASK32_BYTE1 ) >> OFS32_BYTE1 ;
406- uint32_t byte2 = (line & MASK32_BYTE2 ) >> OFS32_BYTE2 ;
407- uint32_t byte3 = (line & MASK32_BYTE3 ) >> OFS32_BYTE3 ;
408-
409403 * pLine = (getSBoxValue ((line & MASK32_BYTE0 ) >> OFS32_BYTE0 ) << OFS32_BYTE0 ) |
410404 (getSBoxValue ((line & MASK32_BYTE1 ) >> OFS32_BYTE1 ) << OFS32_BYTE1 ) |
411405 (getSBoxValue ((line & MASK32_BYTE2 ) >> OFS32_BYTE2 ) << OFS32_BYTE2 ) |
412406 (getSBoxValue ((line & MASK32_BYTE3 ) >> OFS32_BYTE3 ) << OFS32_BYTE3 );
413407 }
414408
415- #else
416- unsigned int i , j ;
417- for (i = 0 ; i < 4 ; ++ i )
418- {
419- for (j = 0 ; j < 4 ; ++ j )
420- {
421- (* state )[j ][i ] = getSBoxValue ((* state )[j ][i ]);
422- //(*state)[i][j] = getSBoxValue((*state)[i][j]);
423- }
424- }
425-
426- #endif
427409
428410}
429411
@@ -505,6 +487,7 @@ static void ShiftRows(state_t* state)
505487#endif
506488}
507489
490+ #if 0 /*removing because xtime has been redefined*/
508491#ifndef _DEBUG
509492static inline uint32_t xtime (uint32_t x )
510493{
@@ -517,56 +500,40 @@ static inline uint64_t xtime64(uint64_t x)
517500#else
518501#define xtime (x ) ((((x)<<1) ^ ((((x)>>7)) * 0x1b)) & 0xFF)
519502#endif
503+ #endif
520504
/* xtime
 * Multiplies by 2 in GF(2^8) (reduction constant 0x1b) on all four bytes
 * packed in a 32-bit word at once. The original xtime operated on a single
 * byte; this version performs the same calculation per byte lane, reducing
 * operations and memory accesses.
 *
 * x       four independent field elements, one per byte
 * returns the four doubled elements, each in the lane it came from
 */
static inline uint32_t xtime(uint32_t x)
{
  /* Clear each lane's top bit before shifting so nothing leaks into the
   * neighbouring lane. */
  const uint32_t doubled = (x & UINT32_C(0x7f7f7f7f)) << 1;

  /* Move each lane's top bit to that lane's bit 0; multiplying by 0x1b then
   * places the reduction constant in exactly the lanes that overflowed
   * (0x1b fits in one byte, so the product cannot carry across lanes). */
  const uint32_t overflow = (x & UINT32_C(0x80808080)) >> 7;

  return doubled ^ (overflow * UINT32_C(0x1b));
}
/* xtime64
 * 64-bit counterpart of xtime(): multiplies by 2 in GF(2^8) (reduction
 * constant 0x1b) on all eight bytes packed in x, lane by lane.
 *
 * For inputs in 0..255 the result is the same as the previous single-byte
 * form. Wider inputs, which the previous form could not handle because the
 * overflow bit was not masked, are now processed per byte.
 */
static inline uint64_t xtime64(uint64_t x)
{
  /* Clear each lane's top bit before shifting so nothing leaks into the
   * neighbouring lane. */
  const uint64_t doubled = (x & UINT64_C(0x7f7f7f7f7f7f7f7f)) << 1;

  /* One bit per lane that overflowed, moved to that lane's bit 0. */
  const uint64_t overflow = (x & UINT64_C(0x8080808080808080)) >> 7;

  return doubled ^ (overflow * UINT64_C(0x1b));
}
521517
522-
523- // MixColumns function mixes the columns of the state matrix
518+ /* MixColumns
519+ * replaced byte-wise operations with word-based operations
520+ * eliminated repeated calculations
521+ */
524522static void MixColumns (state_t * state )
525523{
526- unsigned int i ;
527- for (i = 0 ; i < 4 ; ++ i )
528- {
529-
530- #if 1
531- uint32_t * pLine = ((uint32_t * )state + i );
532- uint32_t line = * pLine ;
533-
534- uint32_t byte0 = (line & MASK32_BYTE0 ) >> OFS32_BYTE0 ;
535- uint32_t byte1 = (line & MASK32_BYTE1 ) >> OFS32_BYTE1 ;
536- uint32_t byte2 = (line & MASK32_BYTE2 ) >> OFS32_BYTE2 ;
537- uint32_t byte3 = (line & MASK32_BYTE3 ) >> OFS32_BYTE3 ;
538-
539- uint32_t t = byte0 ;
540- uint32_t Tmp = byte0 ^ byte1 ^ byte2 ^ byte3 ;
541- byte0 ^= xtime (byte0 ^ byte1 ) ^ Tmp ;
542- byte1 ^= xtime (byte1 ^ byte2 ) ^ Tmp ;
543- byte2 ^= xtime (byte2 ^ byte3 ) ^ Tmp ;
544- byte3 ^= xtime (byte3 ^ t ) ^ Tmp ;
545-
546- * pLine = (byte0 << OFS32_BYTE0 ) | (byte1 << OFS32_BYTE1 ) | (byte2 << OFS32_BYTE2 ) | (byte3 << OFS32_BYTE3 );
547- #else
548- uint32_t Tm ;
549- /* GK
550-
551- t = (*state)[i][0];
552- Tmp = (*state)[i][0] ^ (*state)[i][1] ^ (*state)[i][2] ^ (*state)[i][3] ;
553- Tm = (*state)[i][0] ^ (*state)[i][1] ; Tm = xtime(Tm); (*state)[i][0] ^= Tm ^ Tmp ;
554- Tm = (*state)[i][1] ^ (*state)[i][2] ; Tm = xtime(Tm); (*state)[i][1] ^= Tm ^ Tmp ;
555- Tm = (*state)[i][2] ^ (*state)[i][3] ; Tm = xtime(Tm); (*state)[i][2] ^= Tm ^ Tmp ;
556- Tm = (*state)[i][3] ^ t ; Tm = xtime(Tm); (*state)[i][3] ^= Tm ^ Tmp ;
557- */
558- // GK - slightly more optimal and simple
559- t = (* state )[i ][0 ];
560- Tmp = (* state )[i ][0 ] ^ (* state )[i ][1 ] ^ (* state )[i ][2 ] ^ (* state )[i ][3 ];
561- Tm = xtime ((* state )[i ][0 ] ^ (* state )[i ][1 ]); (* state )[i ][0 ] ^= Tm ^ Tmp ;
562- Tm = xtime ((* state )[i ][1 ] ^ (* state )[i ][2 ]); (* state )[i ][1 ] ^= Tm ^ Tmp ;
563- Tm = xtime ((* state )[i ][2 ] ^ (* state )[i ][3 ]); (* state )[i ][2 ] ^= Tm ^ Tmp ;
564- Tm = xtime ((* state )[i ][3 ] ^ t ); (* state )[i ][3 ] ^= Tm ^ Tmp ;
565-
566- #endif
567- }
524+ unsigned int * sp = (unsigned int * ) state ;
525+
526+ for (int i = 4 ;i ;-- i ,sp ++ )
527+ * sp = xtime ((* sp ) ^ (((* sp )>>8 )|((* sp )<<24 ))) ^
528+ (((* sp )<<8 )|((* sp )>>24 )) ^
529+ (((* sp )<<16 )|((* sp )>>16 )) ^ (((* sp )<<24 )|((* sp )>>8 ));
568530}
569531
// Multiply() is still defined below: the generic (non-x64) path of
// InvMixColumns calls it, while the x64 path of InvMixColumns performs the
// equivalent multiplications inline on packed words.
570537// Multiply is used to multiply numbers in the field GF(2^8)
571538// Note: The last call to xtime() is unneeded, but often ends up generating a smaller binary
572539// The compiler seems to be able to vectorize the operation better this way.
@@ -575,118 +542,99 @@ static void MixColumns(state_t* state)
#ifndef _DEBUG
/*
 * Multiply x by the constant y in GF(2^8).
 *
 * Only the low four bits of y are examined (five when built with MSVC for
 * AMD64), so y must fit in that range. x is passed through xtime() unchanged,
 * so it may be whatever xtime() accepts.
 *
 * x       value to multiply
 * y       constant multiplier
 * returns XOR of x, xtime(x), xtime(xtime(x)), ... selected by the bits of y
 */
static inline uint32_t Multiply(uint32_t x, uint32_t y)
{
  uint32_t xtimeX = xtime(x);
  uint32_t xtimeXX = xtime(xtimeX);
  uint32_t xtimeXXX = xtime(xtimeXX);

  /* ~((bit) - 1) is all-ones when the selected bit of y is 1 and zero when it
   * is 0, so each term is kept or dropped without branching. */
  return ((~((y & 1) - 1) & x) ^
          (~((y >> 1 & 1) - 1) & xtimeX) ^
          (~((y >> 2 & 1) - 1) & xtimeXX) ^
          (~((y >> 3 & 1) - 1) & xtimeXXX)
#if defined(_MSC_VER) && defined(_M_AMD64)
          ^
          (~((y >> 4 & 1) - 1) & xtime(xtimeXXX))
#endif
          ); /* this last call to xtime() can be omitted */
}

#else
/*
 * Macro form of Multiply, used when _DEBUG is defined.
 * Both arguments are parenthesised so that compound expressions expand
 * correctly. x is evaluated several times, so pass expressions without side
 * effects.
 */
#define Multiply(x, y)                                   \
  ((~(((y) & 1) - 1) & (x)) ^                            \
   (~(((y) >> 1 & 1) - 1) & xtime(x)) ^                  \
   (~(((y) >> 2 & 1) - 1) & xtime(xtime(x))) ^           \
   (~(((y) >> 3 & 1) - 1) & xtime(xtime(xtime(x)))))

#endif
623569
624570#if (defined(CBC ) && CBC == 1 ) || (defined(ECB ) && ECB == 1 )
625571// MixColumns function mixes the columns of the state matrix.
626572// The method used to multiply may be difficult to understand for the inexperienced.
627573// Please use the references to gain more information.
574+ /* InvMixColumns
575+ * this a more optimal version which performs parallel computation on all bytes in
576+ * a dword and applies the Multiply & xor and unrolls the multiple calls to Multiply
577+ * for each 0x9,0xb,0xd and 0xe perturbation
578+ */
628579static void InvMixColumns (state_t * state )
629580{
630- int i ;
631- #if (USE32_ARITHMETIC == 1 )
632- for (i = 0 ; i < 4 ; ++ i )
633- {
634- uint32_t * pLine = ((uint32_t * )state + i );
635- uint32_t line = * pLine ;
636-
637- uint32_t byte0 = (line & MASK32_BYTE0 ) >> OFS32_BYTE0 ;
638- uint32_t byte1 = (line & MASK32_BYTE1 ) >> OFS32_BYTE1 ;
639- uint32_t byte2 = (line & MASK32_BYTE2 ) >> OFS32_BYTE2 ;
640- uint32_t byte3 = (line & MASK32_BYTE3 ) >> OFS32_BYTE3 ;
641-
642- line = (Multiply (byte0 , 0x0e ) ^ Multiply (byte1 , 0x0b ) ^ Multiply (byte2 , 0x0d ) ^ Multiply (byte3 , 0x09 )) << OFS32_BYTE0 ;
643- line |= (Multiply (byte0 , 0x09 ) ^ Multiply (byte1 , 0x0e ) ^ Multiply (byte2 , 0x0b ) ^ Multiply (byte3 , 0x0d )) << OFS32_BYTE1 ;
644- line |= (Multiply (byte0 , 0x0d ) ^ Multiply (byte1 , 0x09 ) ^ Multiply (byte2 , 0x0e ) ^ Multiply (byte3 , 0x0b )) << OFS32_BYTE2 ;
645- line |= (Multiply (byte0 , 0x0b ) ^ Multiply (byte1 , 0x0d ) ^ Multiply (byte2 , 0x09 ) ^ Multiply (byte3 , 0x0e )) << OFS32_BYTE3 ;
581+ #if !defined(_M_X64 ) // This approach runs a bit faster on arm64-v8a and possibly others
582+ for (int i = 0 ; i < 4 ; ++ i )
583+ {
584+ uint32_t * pLine = ((uint32_t * ) state + i );
585+ uint32_t line = * pLine ;
646586
647- * pLine = line ;
587+ uint32_t byte0 = (line & MASK32_BYTE0 ) >> OFS32_BYTE0 ;
588+ uint32_t byte1 = (line & MASK32_BYTE1 ) >> OFS32_BYTE1 ;
589+ uint32_t byte2 = (line & MASK32_BYTE2 ) >> OFS32_BYTE2 ;
590+ uint32_t byte3 = (line & MASK32_BYTE3 ) >> OFS32_BYTE3 ;
648591
592+ line = (Multiply (byte0 , 0x0e ) ^ Multiply (byte1 , 0x0b ) ^ Multiply (byte2 , 0x0d ) ^
593+ Multiply (byte3 , 0x09 )) << OFS32_BYTE0 ;
594+ line |= (Multiply (byte0 , 0x09 ) ^ Multiply (byte1 , 0x0e ) ^ Multiply (byte2 , 0x0b ) ^
595+ Multiply (byte3 , 0x0d )) << OFS32_BYTE1 ;
596+ line |= (Multiply (byte0 , 0x0d ) ^ Multiply (byte1 , 0x09 ) ^ Multiply (byte2 , 0x0e ) ^
597+ Multiply (byte3 , 0x0b )) << OFS32_BYTE2 ;
598+ line |= (Multiply (byte0 , 0x0b ) ^ Multiply (byte1 , 0x0d ) ^ Multiply (byte2 , 0x09 ) ^
599+ Multiply (byte3 , 0x0e )) << OFS32_BYTE3 ;
649600
650- /* GK
651- a = (*state)[i][0];
652- b = (*state)[i][1];
653- c = (*state)[i][2];
654- d = (*state)[i][3];
655-
656- (*state)[i][0] = Multiply(a, 0x0e) ^ Multiply(b, 0x0b) ^ Multiply(c, 0x0d) ^ Multiply(d, 0x09);
657- (*state)[i][1] = Multiply(a, 0x09) ^ Multiply(b, 0x0e) ^ Multiply(c, 0x0b) ^ Multiply(d, 0x0d);
658- (*state)[i][2] = Multiply(a, 0x0d) ^ Multiply(b, 0x09) ^ Multiply(c, 0x0e) ^ Multiply(d, 0x0b);
659- (*state)[i][3] = Multiply(a, 0x0b) ^ Multiply(b, 0x0d) ^ Multiply(c, 0x09) ^ Multiply(d, 0x0e);
660- */
661- }
662- #else
663- for (i = 0 ; i < 2 ; ++ i )
664- {
665- uint64_t * pDoubleLine = ((uint64_t * )state + i );
666- uint64_t doubleLine = * pDoubleLine ;
667-
668- uint64_t byte0 = (doubleLine & MASK64_BYTE0 ) >> OFS64_BYTE0 ;
669- uint64_t byte1 = (doubleLine & MASK64_BYTE1 ) >> OFS64_BYTE1 ;
670- uint64_t byte2 = (doubleLine & MASK64_BYTE2 ) >> OFS64_BYTE2 ;
671- uint64_t byte3 = (doubleLine & MASK64_BYTE3 ) >> OFS64_BYTE3 ;
672-
673- uint64_t result = (Multiply64 (byte0 , 0x0e ) ^ Multiply64 (byte1 , 0x0b ) ^ Multiply64 (byte2 , 0x0d ) ^ Multiply64 (byte3 , 0x09 )) << OFS64_BYTE0 ;
674- result |= (Multiply64 (byte0 , 0x09 ) ^ Multiply64 (byte1 , 0x0e ) ^ Multiply64 (byte2 , 0x0b ) ^ Multiply64 (byte3 , 0x0d )) << OFS64_BYTE1 ;
675- result |= (Multiply64 (byte0 , 0x0d ) ^ Multiply64 (byte1 , 0x09 ) ^ Multiply64 (byte2 , 0x0e ) ^ Multiply64 (byte3 , 0x0b )) << OFS64_BYTE2 ;
676- result |= (Multiply64 (byte0 , 0x0b ) ^ Multiply64 (byte1 , 0x0d ) ^ Multiply64 (byte2 , 0x09 ) ^ Multiply64 (byte3 , 0x0e )) << OFS64_BYTE3 ;
677-
678- byte0 = (doubleLine & MASK64_BYTE4 ) >> OFS64_BYTE4 ;
679- byte1 = (doubleLine & MASK64_BYTE5 ) >> OFS64_BYTE5 ;
680- byte2 = (doubleLine & MASK64_BYTE6 ) >> OFS64_BYTE6 ;
681- byte3 = (doubleLine & MASK64_BYTE7 ) >> OFS64_BYTE7 ;
682-
683- result |= (Multiply64 (byte0 , 0x0e ) ^ Multiply64 (byte1 , 0x0b ) ^ Multiply64 (byte2 , 0x0d ) ^ Multiply64 (byte3 , 0x09 )) << OFS64_BYTE4 ;
684- result |= (Multiply64 (byte0 , 0x09 ) ^ Multiply64 (byte1 , 0x0e ) ^ Multiply64 (byte2 , 0x0b ) ^ Multiply64 (byte3 , 0x0d )) << OFS64_BYTE5 ;
685- result |= (Multiply64 (byte0 , 0x0d ) ^ Multiply64 (byte1 , 0x09 ) ^ Multiply64 (byte2 , 0x0e ) ^ Multiply64 (byte3 , 0x0b )) << OFS64_BYTE6 ;
686- result |= (Multiply64 (byte0 , 0x0b ) ^ Multiply64 (byte1 , 0x0d ) ^ Multiply64 (byte2 , 0x09 ) ^ Multiply64 (byte3 , 0x0e )) << OFS64_BYTE7 ;
687-
688- * pDoubleLine = result ;
689- }
601+ * pLine = line ;
602+ }
603+ #else // This way is more efficient on the x64 Intel/AMD architecture
604+ uint32_t * sp = (uint32_t * )state ;
605+ uint32_t xtimeX ;
606+ uint32_t xtimeXX ;
607+ uint32_t xtimeXXX ;
608+ uint32_t xtime_x9 ;
609+ uint32_t xtime_xb ;
610+ uint32_t xtime_xd ;
611+ uint32_t xtime_xe ;
612+ //*sp++ = i;
613+
614+ for (int i = 4 ; i ; -- i , sp ++ )
615+ {
616+ uint32_t spVal = * sp ;
617+ xtimeX = xtime (spVal );
618+ xtimeXX = xtime (xtimeX );
619+ xtimeXXX = xtime (xtimeXX );
620+
621+ xtime_x9 = xtimeXXX ^ spVal ;
622+ xtime_xb = xtimeXXX ^ xtimeX ^ spVal ;
623+ xtime_xd = xtimeXXX ^ xtimeXX ^ spVal ;
624+ xtime_xe = xtimeXXX ^ xtimeXX ^ xtimeX ;
625+
626+ uint32_t xtime_xb_r8 = xtime_xb >> 8 ;
627+ uint32_t xtime_xd_r16 = xtime_xd >> 16 ;
628+ uint32_t xtime_x9_l8 = xtime_x9 << 8 ;
629+ uint32_t xtime_xd_l16 = xtime_xd << 16 ;
630+
631+ //this next assignment incorporates all of the Multiply calls, eliminating the repeated re-calculations
632+ * sp =
633+ /* byte 0:*/ (((xtime_xe ^ xtime_xb_r8 ^ xtime_xd_r16 ^ (xtime_x9 >> 24 )) & 0x000000ff ) |
634+ /* byte 1:*/ ((xtime_x9_l8 ^ xtime_xe ^ xtime_xb_r8 ^ xtime_xd_r16 ) & 0x0000ff00 ) |
635+ /* byte 2:*/ ((xtime_xd_l16 ^ xtime_x9_l8 ^ xtime_xe ^ xtime_xb_r8 ) & 0x00ff0000 ) |
636+ /* byte 3:*/ (((xtime_xb << 24 ) ^ xtime_xd_l16 ^ xtime_x9_l8 ^ xtime_xe ) & 0xff000000 ));
637+ }
690638#endif
691639}
692640
0 commit comments