@@ -40,7 +40,7 @@ scratch RAM and the stack pointer is overwritten.
4040#define CTAG8   0x33 
4141#define CTAG9   0x34 
4242#define CTAG10  0x35  @  not  used 
43- #define CTAG11  0x36 
43+ #define CTAG11  0x36  @  not  used 
4444#define CTAG12  0x37 
4545#define CTAG13  0x38 
4646#define CTAG14  0x39 
@@ -93,6 +93,8 @@ scratch RAM and the stack pointer is overwritten.
9393.endif 
9494.endm 
9595
96+ @ Clear internal stripe load registers ,   and  r0 - r3 
97+ @  0  <= offset <=  32 
9698.macro clear03 offset= 0 
9799 getchaffaddress r0 , \offset 
98100 ldmia r0 , {r0 - r3} 
@@ -158,6 +160,10 @@ RKshareC:                    @ Round key common share C; see comment at init_key
158160.space  4 
159161RKshareCchange:              @ Temporary used by ref_roundkey_share_s 
160162.space  4 
163+ IV0:                         @  2 - way share of IV for block  0 
164+ .space  36                     @ Considering IV0 as a word pointer, the format is IV = IV0[0,1,2,3] ^ (IV0[5,6,7,8],ror#16)
165+                              @ The gap at IV0[4] is to defeat unsharing by internal striped memory registers
166+                              @ I.e., there are implicit XORs IV0[0]^IV0[4], IV0[1]^IV0[5], ..., that the 1 word offset renders useless
161167
162168@ Regardless of configuration ,  the code uses a single  256 - entry LUT , 
163169@ which is a simple S - box table. 
@@ -323,11 +329,11 @@ gen_rand_sha:
323329 ldr r2 , =rstate_sha 
324330 ldr r0 ,[ r2 , #jstate - rstate_sha ] 
325331  movs  r1 , # 1 
326-   movs  r3 , r0 , lsl # 2 
327-  ands  r3 , r3 , # 31 
328-   movs  r3 , r1 , lsl  r3       @  1 <<( 4 * (r0& 7 )) 
329-  udiv r3 , r3 , r1           @ Takes constant  +  (r0& 7 ) cycles 
330-  lsrs r0 , r0 , # 1 
332+  ands  r3 , r0 , # 3 
333+   movs  r3 , r3 , lsl # 2 
334+   movs  r3 , r1 , lsl  r3       @  1 <<( 4 * (r0& 3 )) 
335+  udiv r3 , r3 , r1           @ Takes constant  +  (r0& 3 ) cycles 
336+  lsrs r0 , r0 , # 2 
331337 bne 1f 
332338  bl  gen_rand_sha_nonpres 
333339 ldr r2 , =rstate_sha 
@@ -352,6 +358,7 @@ gen_rand_sha_nonpres:
352358 strb r3 ,[ r2 ]                 @ save updated SUM register offset  in  bottom byte of rstate_sha [] 
353359  bx   r14 
3543601 : 
361+ @  [ CK_JITTER code was here ] 
355362  movs  r3 , #SHA256_SUM6_OFFSET + 1 
356363 strb r3 ,[ r2 ]                 @ reset word counter: the  + 1  is compensated for later 
357364 movw r1 , #( 1 <<SHA256_CSR_BSWAP_LSB) + ( 1 <<SHA256_CSR_START_LSB) 
@@ -437,10 +444,13 @@ gen_rand_lfsr_nonpres:
437444.balign  4 
438445.thumb_func 
439446decrypt: 
447+ @ r0= 4 - way key ,  r1=IV_shareA ,  r2=IV_shareB ,  r3=message buffer ,   [ r13 ] =number of blocks 
448+  ldr  r12 ,[ r13 ]                @ Load 5th argument from the stack into r12 without adjusting r13 (r12 may be treated as scratch according to AAPCS)
440449  push  { r14 } 
441450 GET_CANARY  r14 , CTAG3 , 6 
442451 SET_COUNT  23 , 6 
443-   push  {r0 - r12 , r14 } 
452+   push  {r4 - r11 , r14 } 
453+   push  {r0 - r3 , r12 }            @ Save the five arguments 
444454  bl  reset_sha_trng 
445455  bl  init_rstate 
446456@ randomly re - share the LUT contents 
@@ -463,11 +473,11 @@ decrypt:
463473  bl  init_key_4way 
464474 CHK_COUNT  31 , 6 
465475  bl  lock_key 
466-   pop  {r0 - r2} 
476+   pop  {r0 - r3}                 @ r0=IV_shareA ,  r1=IV_shareB ,  r2=message ,  r3=num blocks 
467477  bl  ctr_crypt_s 
468478  bl  randomisechaff 
469479 clear03 
470-   pop  {r4 - r12 , r14 } 
480+   pop  {r4 - r11 , r14 } 
471481 CHK_CANARY  r14 , CTAG3 , 6 
472482  pop  { r15 } 
473483
@@ -859,7 +869,7 @@ ref_roundkey_hvperms_s_exit:  @ label exit point to be to able to specify to ana
859869.if ST_VPERM 
860870.balign  4 
861871.thumb_func 
862- @ Rotate  share registers r4 - r7 ,   r8 - r11  (r4 - >r5 - r6 - >r7 - >r4 etc.) by an addtional amount 
872+ @ Cycle share registers r4-r7, r8-r11 (r4->r5->r6->r7->r4 etc.) by an additional amount
863873@ given  in  the bottom two bits of R0  and  update the rotation recorded  at  statevperm. 
864874@ On entry R1 must point to statevperm. 
865875@ Trashes r0 - r3 , r12 
@@ -901,46 +911,7 @@ addstatevperm_exit:           @ label exit point to be to able to specify to ana
901911  bx   r14 
902912.endif 
903913
904- @ Switch from non - shared to shared state 
905- @ Trashes r0 - r3 , r12 
906- .balign  4 
907- ns_to_s: 
908-  GET_CANARY  r12 , CTAG11 , 6 
909-   push  { r12 , r14 } 
910- .if ST_SHAREC 
911-   bl  gen_rand_sha_nonpres                   @ Create state share C ; all bytes the same 
912-  ands r0 , r0 , # 255 
913-  orrs r0 , r0 , r0 , lsl # 8 
914-  orrs  r12 , r0 , r0 , lsl # 16 
915-  ldr r1 , =shareC 
916-   str   r12 ,[ r1 ] 
917- .else 
918-   movs   r12 , # 0 
919- .endif 
920-   bl  gen_rand_sha_nonpres 
921-  eors r4 , r4 , r0 
922-  eor  r8 , r12 , r0 , ror # 16 
923-   bl  gen_rand_sha_nonpres 
924-  eors r5 , r5 , r0 
925-  eor  r9 , r12 , r0 , ror # 16 
926-   bl  gen_rand_sha_nonpres 
927-  eors r6 , r6 , r0 
928-  eor  r10 , r12 , r0 , ror # 16 
929-   bl  gen_rand_sha_nonpres 
930-  eors r7 , r7 , r0 
931-  eor  r11 , r12 , r0 , ror # 16 
932- .if ST_VPERM 
933-   bl  gen_rand_sha_nonpres 
934-  ldr r1 , =statevperm 
935-   movs  r2 , # 0 
936-   str  r2 ,[ r1 ] 
937-   bl  addstatevperm                          @ Initialise state vperm with SHA RNG ,  refresh with LFSR RNG 
938- .endif 
939-   pop  { r12 , r14 } 
940-  CHK_CANARY  r12 , CTAG11 , 6 
941-   bx   r14 
942- 
943- @ Conjugate lut_a ,  lut_b with shareC 
914+ @ Conjugate lut_a ,  lut_b with (state) shareC 
944915@ I.e. ,  EOR the input  and  output with shareC. 
945916@ We need to pick one input for  each  share A  and  B ,   and  one output for ONE of the shares A  and  B 
946917@ Arbitrarily choosing a0 ,  b1  and  d0 
@@ -1653,44 +1624,65 @@ addrkey_s:
16531624.endif 
16541625
16551626ctr_crypt_s: 
1656- @ r0=IV  ,  r1=cipher/plaintext buffer ,  r2 =number of blocks 
1627+ @ r0=IV_shareA  ,  r1=IV_shareB  ,  r2= cipher/plaintext buffer,  r3 =number of blocks 
16571628 GET_CANARY  r12 , CTAG0 , 6 
16581629  push  {r0 - r12 , r14 }           @ save all registers so th at  when we restore we overwrite any secrets 
16591630
1660-   push  {r0 - r2} 
1631+   push  {r0 - r3} 
1632+   
16611633 SET_COUNT  93 , 6 
16621634
16631635.if CT_BPERM 
16641636@ Initialise  32  random numbers (which fit  in  half - words) 
1637+ @ r3=number of blocks 
16651638 ldr r4 , =bperm_rand 
16661639  movs  r5 , # 32 
166716401 : 
16681641  bl  gen_rand_sha 
1669-  umull r0 , r3 , r0 , r2         @ Random number between 0   and  n - 1  (n=#blocks) 
1670-  strh r3  ,[ r4 ], # 2 
1642+  umull r0 , r2 , r0 , r3         @ Random number between 0   and  n - 1  (n=#blocks) 
1643+  strh r2  ,[ r4 ], # 2 
16711644 subs r5 , r5 , # 1 
16721645 bne  1b 
16731646.endif 
16741647
16751648  bl  randomisechaff 
1676-   pop  {r0 - r2} 
1649+ 
1650+ @ Refresh IVshareA and IVshareB, convert to ror#16 format and store the result at IV0
1651+ @ Not doing shareC or state vperm at this point
1652+   pop  {r0} 
1653+  ldmia r0 , {r4 - r7}         @ r4 - r7 = IVshareA 
1654+  clear03  16 
1655+   pop  {r1} 
1656+  ldmia r1 , { r8 - r11 }        @  r8 - r11  = IVshareB 
1657+  clear03  32 
1658+   bl  gen_rand_sha_nonpres ; eors r4,r4,r0;  mov r8, r8, ror#16;  eor r8, r8, r0,ror#16 
1659+   bl  gen_rand_sha_nonpres ; eors r5,r5,r0;  mov r9, r9, ror#16;  eor r9, r9, r0,ror#16 
1660+   bl  gen_rand_sha_nonpres ; eors r6,r6,r0;  mov r10,r10,ror#16;  eor r10,r10,r0,ror#16 
1661+   bl  gen_rand_sha_nonpres ; eors r7,r7,r0;  mov r11,r11,ror#16;  eor r11,r11,r0,ror#16 
1662+  ldr r0 , =IV0 
1663+  stmia r0 , {r4 - r7} 
1664+  adds r0 , r0 , # 20 
1665+  stmia r0 , { r8 - r11 } 
1666+   pop  {r1 , r2} 
1667+ @ r1=cipher/plaintext buffer ,  r2=number of blocks 
1668+ 
16771669  movs  r3 , # 0 
16781670 CHK_COUNT  93 , 6 
16791671
16801672ctr_crypt_mainloop: 
16811673 SET_COUNT  80 , 6 
1682- @ here r0=IV  ,   r1=cipher/plaintext buffer,  r2=number of blocks ,  r3=block counter 
1674+ @ r1=cipher/plaintext buffer ,  r2=number of blocks ,  r3=block counter 
16831675
16841676@ Do as much preparatory stuff as possible that doesn't involve the IV (to reduce interaction with it)
1685-   push  {r0  - r3} 
1677+   push  {r1  - r3} 
16861678@ It's OK for execution time to depend on the block counter r3 ("public"), but not the block number (secret)
16871679
16881680 tst r3 , #(REFCHAFF_PERIOD - 1 ) 
16891681 bne 1f 
16901682  bl  refreshchaff_and_lfsr 
169116831 : 
16921684
1693-  ldr r3 ,[ r13 , # 12 ]             @ get block count off the stack 
1685+  ldr r3 ,[ r13 , # 8 ]               @ get block count off the stack
16941686 tst r3 , #(REMAP_PERIOD - 1 ) 
16951687 bne 1f 
16961688  bl  remap                    @ shuffle the LUTs ; this preserves R3 
@@ -1702,21 +1694,21 @@ ctr_crypt_mainloop:
17021694  bl  ref_roundkey_shares_s    @ refresh the round key shares 
170316951 : 
17041696
1705-  ldr r3 ,[ r13 , # 12 ]             @ get block count off the stack 
1697+  ldr r3 ,[ r13 , # 8 ]               @ get block count off the stack
17061698 tst r3 , #(REFROUNDKEYHVPERMS_PERIOD - 1 ) 
17071699 bne 1f 
17081700  bl  ref_roundkey_hvperms_s   @ refresh the round key vperms 
170917011 : 
17101702
17111703 CHK_COUNT  81 , 6 
17121704
1713-   pop  {r0  - r3} 
1714- @ r0=IV  ,   r1=cipher/plaintext buffer,  r2=number of blocks ,  r3=block counter 
1705+   pop  {r1  - r3} 
1706+ @ r1=cipher/plaintext buffer ,  r2=number of blocks ,  r3=block counter 
17151707
17161708@ Now calculate  r12  = block number - to - be - deciphered from r3 = block counter 
17171709.if CT_BPERM 
17181710@ Use a  "swap-or-not"  method to generate an  "oblivious"  permutation ; see makeperm.py version 7 
1719-   push  {r0  , r1} 
1711+   push  {r1} 
17201712 ldr r0 , =murmur3_constants 
17211713 ldmia r0 , { r9 - r12 , r14 }       @ load five murmur3_32 hash constants 
17221714 ldr r0 , =bperm_rand 
@@ -1752,57 +1744,53 @@ ctr_crypt_mainloop:
17521744 adds r4 , r4 , r7               @ r4=j if top bit of r6 ,  else i 
17531745 subs r1 , r1 , # 1 
17541746 bpl  1b 
1755-   pop  {r0  , r1} 
1747+   pop  {r1} 
17561748  mov   r12 , r4 
17571749.else 
17581750  mov   r12 , r3 
17591751.endif 
17601752 CHK_COUNT  82 , 6 
17611753
1762- @ r0=IV ,  r1=cipher/plaintext buffer ,  r2=number of blocks ,  r3=block counter (monotonic) ,   r12 =block number (block to be deciphered) 
1763-   push  {r0 - r3 , r12 } 
1754+ @ r1=cipher/plaintext buffer ,  r2=number of blocks ,  r3=block counter (monotonic) ,   r12 =block number (block to be deciphered) 
1755+   push  {r1 - r3 , r12 } 
1756+ @ r4 - r11  = IV0 ,   r12 =block number 
17641757
17651758processIV:                   @ non - target label to assist power analysis 
1766- 
1767- @ It is  not  clear if the following addition of the block number  in   r12  to the IV can usefully 
1768- @ be done  in  terms of shares. Instead we do an addition  and  subtraction whose overall effect 
1769- @ is the same ,   and  which provides a small degree of masking. The IV is  not  traditionally a secret , 
1770- @ though it will make it harder for the attacker if it is obscured. 
1771-   bl  gen_rand_sha 
1772-   movs   r8 , r0 , lsr# 16            @ only use  16  low bits so we don't get any overflows  in  the following ,   and  so th at  a carry from the first word is rare 
1773-   add   r9 , r8 , r12                @  "masked"  block number 
1774- @  r8 =random ,   r9 =(block number) + r8 ,  stack=IV , ... 
1775- 
1776-  ldr r0 ,[ r13 ]                 @ peek  at  stack to restore r0=IV ptr 
1777-  ldmia r0 , {r4 - r7}            @ load IV 
1778-  clear03                     @ barrier to remove traces of IV from internal  CPU  load registers 
1779- 
1780- @  Add   in   r9   in  byte - big - endian ,  bit - little - endian (!) fashion ,  while trying to avoid rev operations 
1781- @ as far as possible as these tend to expose (via power fluctuations) byte - level hamming weights. 
1782- @ First do  128 - bit addition of  r9  to byte - reversed IV 
1783-  rev r7 , r7 
1784-  cmn r7 , #MAX_NUM_BLOCKS      @ Compare against maximum number of blocks 
1785-  bcs 1f 
1786-   add  r7 , r7 , r9                 @ This can temporarily overflow but it doesn't matter as we know th at  r7 + r12  does  not  overflow 
1787-   sub  r7 , r7 , r8 
1788-  b 2f 
1789- 1 : 
1790-             adds r7 , r7 , r9 
1791-  rev r6 , r6 ; adcs r6,r6,#0 
1792-  rev r5 , r5 ; adcs r5,r5,#0 
1793-  rev r4 , r4 ; adcs r4,r4,#0 
1794- @ Now do  128 - bit subtraction of  r8  from byte - reversed IV 
1795-  subs r7 , r7 , r8 
1796-  sbcs r6 , r6 , # 0 ; rev r6,r6 
1797-  sbcs r5 , r5 , # 0 ; rev r5,r5 
1798-  sbcs r4 , r4 , # 0 ; rev r4,r4 
1799- 2 : 
1800-  rev r7 , r7 
1801-  clear01  16 
1759+  ldr  r8 , =IV0 
1760+  ldmia  r8 , {r4 - r7}            @ load IV0_A 
1761+  clear03  16 
1762+   add   r8 , r8 , # 20 
1763+  ldmia  r8 , { r8 - r11 }           @ load IV0_B 
1764+  clear03  32 
1765+  rev r0 , r12 
1766+  eor r7 , r7 , r0                @  XOR   in  block number to IV0. IV(block n) = IV0 ^ n ,  cf standard CTR mode IV0  +  n. 
1767+                              @  XOR  (vs addition) is compatible with  XOR - shares ,  so stealthier/simpler because don't have to unshare to work  out  IV(block n) 
1768+ @ r4 - r11  = IV for the current block 
18021769 CHK_COUNT  83 , 6 
1770+ .if ST_SHAREC 
1771+   bl  gen_rand_sha_nonpres     @ Create state share C ; all bytes the same 
1772+  ands r0 , r0 , # 255 
1773+  orrs r0 , r0 , r0 , lsl # 8 
1774+  orrs  r12 , r0 , r0 , lsl # 16 
1775+  ldr r1 , =shareC 
1776+   str   r12 ,[ r1 ] 
1777+ .else 
1778+   movs   r12 , # 0 
1779+ .endif 
1780+ @ r4 - r11  = IV for the current block w/o shareC ,   r12 =shareC 
1781+ @ refresh state shares  and  mix  in  shareC 
1782+   bl  gen_rand_sha_nonpres ; eors r4,r4,r0; eor r4,r4,r12; movs r1,#0; eor r8, r8, r0,ror#16   @ Barriers between shares to prevent implicit r4^r8 etc 
1783+   bl  gen_rand_sha_nonpres ; eors r5,r5,r0; eor r5,r5,r12; movs r1,#0; eor r9, r9, r0,ror#16 
1784+   bl  gen_rand_sha_nonpres ; eors r6,r6,r0; eor r6,r6,r12; movs r1,#0; eor r10,r10,r0,ror#16 
1785+   bl  gen_rand_sha_nonpres ; eors r7,r7,r0; eor r7,r7,r12; movs r1,#0; eor r11,r11,r0,ror#16 
1786+ .if ST_VPERM 
1787+   bl  gen_rand_sha_nonpres 
1788+  ldr r1 , =statevperm 
1789+   movs  r2 , # 0 
1790+   str  r2 ,[ r1 ] 
1791+   bl  addstatevperm            @ Initialise state vperm (use SHA RNG to start with ,  later refreshes are with LFSR RNG) 
1792+ .endif 
18031793
1804- @ r4 - r7 = IV for the current block 
1805-   bl  ns_to_s                  @ convert IV + x to shares ,  which includes choosing  and  incorporating a random shareC 
18061794 CHK_COUNT  84 , 6 
18071795  bl  conjshareC               @  Add  the effect of shareC to lut_a ,  lut_b 
18081796 CHK_COUNT  85 , 6 
@@ -1849,9 +1837,9 @@ rounds_s_mainloop:
18491837  bl  addstatevperm 
18501838.endif 
18511839
1852-   pop  {r0  - r3 , r12 } 
1853-   push  {r0  , r3} 
1854- @ r0=IV  ,   r1=cipher/plaintext buffer,  r2=number of blocks ,  r3=block counter ,   r12 =block to be deciphered 
1840+   pop  {r1  - r3 , r12 } 
1841+   push  {r3} 
1842+ @ r1=cipher/plaintext buffer ,  r2=number of blocks ,  r3=block counter ,   r12 =block to be deciphered 
18551843
18561844decryption_start: 
18571845@ Decrypt ciphertext using AES output  in  shares: r4 - r11 
@@ -1893,8 +1881,8 @@ decryption_start:
18931881  sub  r1 , r1 , r12 , lsl # 4          @ Restore r1 to point to start of buffer 
18941882 CHK_COUNT  90 , 6 
18951883
1896-   pop  {r0  , r3}                 @ Restore IV   and  block counter 
1897- @ r0=IV  ,   r1=cipher/plaintext buffer,  r2=number of blocks ,  r3=block counter 
1884+   pop  {r3}                    @ Restore  block counter 
1885+ @ r1=cipher/plaintext buffer ,  r2=number of blocks ,  r3=block counter 
18981886decryption_end: 
18991887
19001888 adds r3 , r3 , # 1 
0 commit comments