|
11 | 11 |
|
12 | 12 | #if defined(__i386__) || defined(__x86_64__)
|
13 | 13 |
|
| 14 | +/* |
| 15 | + * Force usage of rol or ror by selecting the one with the smaller constant. |
| 16 | + * It _can_ generate slightly smaller code (a constant of 1 is special), but |
| 17 | + * perhaps more importantly it's possibly faster on any uarch that does a |
| 18 | + * rotate with a loop. |
| 19 | + */ |
| 20 | + |
14 | 21 | #define SHA_ASM(op, x, n) ({ unsigned int __res; __asm__(op " %1,%0":"=r" (__res):"i" (n), "0" (x)); __res; })
|
15 | 22 | #define SHA_ROL(x,n) SHA_ASM("rol", x, n)
|
16 | 23 | #define SHA_ROR(x,n) SHA_ASM("ror", x, n)
|
17 |
| -#define SMALL_REGISTER_SET |
18 | 24 |
|
19 | 25 | #else
|
20 | 26 |
|
|
24 | 30 |
|
25 | 31 | #endif
|
26 | 32 |
|
27 |
| -/* This "rolls" over the 512-bit array */ |
28 |
| -#define W(x) (array[(x)&15]) |
29 |
| - |
30 | 33 | /*
|
31 | 34 | * If you have 32 registers or more, the compiler can (and should)
|
32 | 35 | * try to change the array[] accesses into registers. However, on
|
|
43 | 46 | * Ben Herrenschmidt reports that on PPC, the C version comes close
|
44 | 47 | * to the optimized asm with this (ie on PPC you don't want that
|
45 | 48 | * 'volatile', since there are lots of registers).
|
| 49 | + * |
| 50 | + * On ARM we get the best code generation by forcing a full memory barrier |
| 51 | + * between each SHA_ROUND; otherwise gcc happily goes wild with spilling, |
| 52 | + * the stack frame size simply explodes, and performance goes down the drain. |
46 | 53 | */
|
47 |
| -#ifdef SMALL_REGISTER_SET |
| 54 | + |
| 55 | +#if defined(__i386__) || defined(__x86_64__) |
48 | 56 | #define setW(x, val) (*(volatile unsigned int *)&W(x) = (val))
|
| 57 | +#elif defined(__arm__) |
| 58 | + #define setW(x, val) do { W(x) = (val); __asm__("":::"memory"); } while (0) |
49 | 59 | #else
|
50 | 60 | #define setW(x, val) (W(x) = (val))
|
51 | 61 | #endif
|
52 | 62 |
|
| 63 | +/* This "rolls" over the 512-bit array */ |
| 64 | +#define W(x) (array[(x)&15]) |
| 65 | + |
53 | 66 | /*
|
54 | 67 | * Where do we get the source from? The first 16 iterations get it from
|
55 | 68 | * the input data, the next mix it from the 512-bit array.
|
|
0 commit comments