@@ -141,6 +141,44 @@ void ascii_pack_simd(const char* ascii, size_t len, uint8_t* bin) {
141141 ascii_pack (ascii, end - ascii, bin);
142142}
143143
144+ void ascii_pack_simd2 (const char * ascii, size_t len, uint8_t * bin) {
145+ // I leave out 16 bytes in addition to 16 that we load in the loop
146+ // because we store into bin full 16 bytes instead of 14. To prevent data
147+ // overwrite we finish loop one iteration earlier.
148+ const char * end = ascii + len - 32 ;
149+
150+ // Skips 8th byte (indexc 7) in the lower 8-byte part.
151+ const __m128i control = _mm_set_epi8 (-1 , -1 , 14 , 13 , 12 , 11 , 10 , 9 , 8 , 6 , 5 , 4 , 3 , 2 , 1 , 0 );
152+
153+ __m128i val, rpart, lpart;
154+
155+ // Based on the question I asked here: https://stackoverflow.com/q/74831843/2280111
156+ while (ascii <= end) {
157+ val = _mm_loadu_si128 (reinterpret_cast <const __m128i*>(ascii));
158+
159+ /*
160+ x = ((x & 0x7F007F007F007F00) >> 1) | (x & 0x007F007F007F007F);
161+ x = ((x & 0x3FFF00003FFF0000) >> 2) | (x & 0x00003FFF00003FFF);
162+ x = ((x & 0x0FFFFFFF00000000) >> 4) | (x & 0x000000000FFFFFFF);
163+ */
164+ val = _mm_maddubs_epi16 (_mm_set1_epi16 (0x8001 ), val);
165+ val = _mm_madd_epi16 (_mm_set1_epi32 (0x40000001 ), val);
166+
167+ rpart = _mm_and_si128 (val, _mm_set1_epi64x (0x000000000FFFFFFF ));
168+ lpart = _mm_and_si128 (val, _mm_set1_epi64x (0x0FFFFFFF00000000 ));
169+ val = _mm_or_si128 (_mm_srli_epi64 (lpart, 4 ), rpart);
170+
171+ val = _mm_shuffle_epi8 (val, control);
172+ _mm_storeu_si128 (reinterpret_cast <__m128i*>(bin), val);
173+ bin += 14 ;
174+ ascii += 16 ;
175+ }
176+
177+ end += 32 ; // Bring back end.
178+ DCHECK (ascii < end);
179+ ascii_pack (ascii, end - ascii, bin);
180+ }
181+
144182// unpacks 8->7 encoded blob back to ascii.
145183// generally, we can not unpack inplace because ascii (dest) buffer is 8/7 bigger than
146184// the source buffer.
0 commit comments