|
30 | 30 | #include <stdlib.h>
|
31 | 31 | #include <string.h>
|
32 | 32 |
|
// FALLTHROUGH marks an intentional fall-through between switch cases.
// Use it as a statement: `FALLTHROUGH;`
//
// The attribute form is selected only when the compiler reports support for
// it; defining __GNUC__ is not enough, since older GNU-compatible compilers
// do not know the statement attribute. Everything else gets a no-op
// expression.
#if defined(__has_attribute)
#if __has_attribute(fallthrough)
#define FALLTHROUGH __attribute__((fallthrough))
#endif
#endif
#ifndef FALLTHROUGH
#define FALLTHROUGH ((void)0) /* FALLTHROUGH */
#endif
| 38 | + |
33 | 39 | // adapted from "Hacker's Delight" - Figure 7-2 Transposing an 8x8-bit matrix
|
34 | 40 | // basic idea is:
|
35 | 41 | // > First, treat the 8x8-bit matrix as 16 2x2-bit matrices, and transpose each
|
|
40 | 46 | // > illustrated below.
|
41 | 47 | // We want a different definition of bit/byte order, deal with strides differently, etc.
|
42 | 48 | // so the code is heavily re-worked compared to the original.
|
43 |
| -static void transpose8(uint32_t *result, const uint8_t *src, int src_stride) { |
// Bit-transpose one byte column taken from up to 8 strands.
//
// result:      receives two 32-bit words (8 bytes of output).
// src:         first byte of strand 0; strand k is read from src[k * src_stride].
// src_stride:  distance in bytes between consecutive strands.
// num_strands: number of strands to read, 1..8. Any other value reads nothing
//              and stores zeros in both result words.
//
// Viewing the two result words as 8 bytes in memory, byte i holds bit (7 - i)
// of every strand, with strand s placed at bit s. Bits for strands that are
// not present are zero.
static void transpose_var(uint32_t *result, const uint8_t *src, int src_stride, int num_strands) {
    uint32_t x = 0, y = 0, t;

    if (num_strands >= 1 && num_strands <= 8) {
        for (int strand = 0; strand < num_strands; strand++) {
            // Widen before shifting: a uint8_t promotes to signed int, and
            // moving a set top bit into bit 31 of an int is undefined.
            const uint32_t byte = src[strand * src_stride];
            const unsigned shift = 8u * ((unsigned)strand & 3u);

            // Strands 0..3 fill y from the low byte up, strands 4..7 fill x.
            if (strand < 4) {
                y |= byte << shift;
            } else {
                x |= byte << shift;
            }
        }
    }

    // Transpose the 2x2-bit blocks inside each word.
    t = (x ^ (x >> 7)) & 0x00AA00AA;
    x = x ^ t ^ (t << 7);
    t = (y ^ (y >> 7)) & 0x00AA00AA;
    y = y ^ t ^ (t << 7);

    // Transpose the 2x2 blocks of those blocks.
    t = (x ^ (x >> 14)) & 0x0000CCCC;
    x = x ^ t ^ (t << 14);
    t = (y ^ (y >> 14)) & 0x0000CCCC;
    y = y ^ t ^ (t << 14);

    // Exchange nibbles between the two words to finish the transposition.
    t = (x & 0xF0F0F0F0) | ((y >> 4) & 0x0F0F0F0F);
    y = ((x << 4) & 0xF0F0F0F0) | (y & 0x0F0F0F0F);
    x = t;

    // Byte-swap on little-endian targets so the most significant byte of each
    // word is the first byte in memory.
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
    x = __builtin_bswap32(x);
    y = __builtin_bswap32(y);
#endif
    result[0] = x;
    result[1] = y;
}
| 98 | + |
| 99 | +static void transpose_8(uint32_t *result, const uint8_t *src, int src_stride) { |
44 | 100 | uint32_t x, y, t;
|
45 | 101 |
|
46 | 102 | y = *src; src += src_stride;
|
@@ -70,14 +126,26 @@ static void transpose8(uint32_t *result, const uint8_t *src, int src_stride) {
|
70 | 126 | result[1] = y;
|
71 | 127 | }
|
72 | 128 |
|
73 |
| -static void bit_transpose(uint32_t *result, const uint8_t *src, size_t src_stride, size_t n) { |
// Run transpose_8 on n consecutive byte columns of src.
// Each column starts one byte after the previous one and writes two 32-bit
// words to result, so result advances by two words per column.
static void bit_transpose_8(uint32_t *result, const uint8_t *src, size_t src_stride, size_t n) {
    const uint8_t *const end = src + n;

    while (src != end) {
        transpose_8(result, src, src_stride);
        result += 2;
        src++;
    }
}
|
80 | 136 |
|
81 |
| -void common_hal_bit_transpose_bit_transpose(uint8_t *result, const uint8_t *src, size_t n) { |
82 |
| - bit_transpose((uint32_t*)(void*)result, src, n/8, n/8); |
// Run transpose_var on n consecutive byte columns of src, forwarding
// src_stride and num_strands unchanged. Column `col` is read starting at
// src[col] and its two output words are stored at result[2 * col].
static void bit_transpose_var(uint32_t *result, const uint8_t *src, size_t src_stride, size_t n, int num_strands) {
    for (size_t col = 0; col < n; col++) {
        transpose_var(&result[2 * col], &src[col], src_stride, num_strands);
    }
}
| 144 | + |
// Bit-transpose `inlen` bytes of `src`, treated as `num_strands` equal-length
// strands stored one after another, into `result`.
//
// Each strand is inlen / num_strands bytes long (integer division). Every
// byte position across the strands produces two 32-bit words (8 bytes) of
// output, so result must have room for 8 * (inlen / num_strands) bytes.
// result is written through a uint32_t pointer; presumably callers supply a
// suitably aligned buffer -- TODO confirm.
//
// num_strands == 8 uses the dedicated 8-strand routine; every other non-zero
// count goes through the variable-width routine. num_strands == 0 is a no-op
// rather than a division by zero.
void common_hal_bit_transpose_bit_transpose(uint8_t *result, const uint8_t *src, size_t inlen, size_t num_strands) {
    if (num_strands == 0) {
        return;
    }

    // Strand length is both the stride between strands and the column count.
    const size_t strand_len = inlen / num_strands;
    uint32_t *out = (uint32_t *)(void *)result;

    if (num_strands == 8) {
        bit_transpose_8(out, src, strand_len, strand_len);
    } else {
        bit_transpose_var(out, src, strand_len, strand_len, (int)num_strands);
    }
}
|
0 commit comments