Commit 00c6bcf

no avx2 for url
1 parent 2345253 commit 00c6bcf

2 files changed: 16 additions & 41 deletions

source/arch/intel/encoding_avx2.c

Lines changed: 10 additions & 40 deletions
@@ -52,28 +52,19 @@ static inline __m256i translate_exact(__m256i in, uint8_t match, uint8_t decode)
  * on decode failure, returns false, else returns true on success.
  */
 static inline bool decode_vec(__m256i *in) {
-    __m256i tmp1, tmp2, tmp3, tmp4, tmp5;
+    __m256i tmp1, tmp2, tmp3;
 
     /*
      * Base64 decoding table, see RFC4648
      *
      * Note that we use multiple vector registers to try to allow the CPU to
-     * parallelize the merging ORs
+     * paralellize the merging ORs
      */
     tmp1 = translate_range(*in, 'A', 'Z', 0 + 1);
     tmp2 = translate_range(*in, 'a', 'z', 26 + 1);
     tmp3 = translate_range(*in, '0', '9', 52 + 1);
-    // Handle both '+' and '-' for value 62
-    tmp4 = translate_exact(*in, '+', 62 + 1);
-    tmp4 = _mm256_or_si256(tmp4, translate_exact(*in, '-', 62 + 1));
-
-    // Handle both '/' and '_' for value 63
-    tmp5 = translate_exact(*in, '/', 63 + 1);
-    tmp5 = _mm256_or_si256(tmp5, translate_exact(*in, '_', 63 + 1));
-
-    // Combine all results
-    tmp1 = _mm256_or_si256(tmp1, tmp4);
-    tmp2 = _mm256_or_si256(tmp2, tmp5);
+    tmp1 = _mm256_or_si256(tmp1, translate_exact(*in, '+', 62 + 1));
+    tmp2 = _mm256_or_si256(tmp2, translate_exact(*in, '/', 63 + 1));
     tmp3 = _mm256_or_si256(tmp3, _mm256_or_si256(tmp1, tmp2));
 
     /*
@@ -273,28 +264,11 @@ static inline __m256i encode_chars(__m256i in) {
     return _mm256_or_si256(tmp3, _mm256_or_si256(tmp1, tmp2));
 }
 
-static inline __m256i encode_chars_url_safe(__m256i in) {
-    __m256i tmp1, tmp2, tmp3;
-
-    /*
-     * Base64 URL encoding table, see RFC4648
-     *
-     * We again use fan-in for the ORs here.
-     */
-    tmp1 = translate_range(in, 0, 25, 'A');
-    tmp2 = translate_range(in, 26, 26 + 25, 'a');
-    tmp3 = translate_range(in, 52, 61, '0');
-    tmp1 = _mm256_or_si256(tmp1, translate_exact(in, 62, '-'));
-    tmp2 = _mm256_or_si256(tmp2, translate_exact(in, 63, '_'));
-
-    return _mm256_or_si256(tmp3, _mm256_or_si256(tmp1, tmp2));
-}
-
 /*
  * Input: A 256-bit vector, interpreted as 24 bytes (LSB) plus 8 bytes of high-byte padding
  * Output: A 256-bit vector of base64 characters
  */
-static inline __m256i encode_stride(__m256i vec, bool url_safe) {
+static inline __m256i encode_stride(__m256i vec) {
     /*
      * First, since byte-shuffle operations operate within 128-bit subvectors, swap around the dwords
      * to balance the amount of actual data between 128-bit subvectors.
@@ -355,14 +329,10 @@ static inline __m256i encode_stride(__m256i vec, bool url_safe) {
     vec = _mm256_or_si256(_mm256_or_si256(digit0, digit1), _mm256_or_si256(digit2, digit3));
 
     /* Finally translate to the base64 character set */
-    return url_safe ? encode_chars(vec) : encode_chars_url_safe(vec);
+    return encode_chars(vec);
 }
 
-void aws_common_private_base64_encode_sse41(
-    const uint8_t *input,
-    uint8_t *output,
-    size_t inlen,
-    bool url_safe_encoding) {
+void aws_common_private_base64_encode_sse41(const uint8_t *input, uint8_t *output, size_t inlen) {
     __m256i instride, outstride;
 
     while (inlen >= 32) {
@@ -372,7 +342,7 @@ void aws_common_private_base64_encode_sse41(
          * of unreadable pages, so we use bounce buffers below.
         */
         instride = _mm256_loadu_si256((__m256i const *)input);
-        outstride = encode_stride(instride, url_safe_encoding);
+        outstride = encode_stride(instride);
         _mm256_storeu_si256((__m256i *)output, outstride);
 
         input += 24;
@@ -391,10 +361,10 @@ void aws_common_private_base64_encode_sse41(
         memset(&instride, 0, sizeof(instride));
         memcpy(&instride, input, stridelen);
 
-        outstride = encode_stride(instride, url_safe_encoding);
+        outstride = encode_stride(instride);
         memcpy(output, &outstride, outlen);
 
-        if (!url_safe_encoding && inlen < 24) {
+        if (inlen < 24) {
             if (inlen % 3 >= 1) {
                 /* AA== or AAA= */
                 output[outlen - 1] = '=';
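For reference, the character-to-value mapping that the surviving translate_range / translate_exact fan-in rebuilds can be written out in scalar form. This is only a sketch, not code from the repository: it assumes translate_range(v, lo, hi, base) maps each byte in [lo, hi] to (byte - lo + base) and everything else to 0, with the per-range results merged by OR, and that the "+ 1" bias exists so a zero result can flag a byte outside the base64 alphabet.

    #include <stdbool.h>
    #include <stdint.h>

    /* Scalar sketch of the decode table above; helper names and behavior are assumptions. */
    static bool decode_byte_scalar(uint8_t c, uint8_t *out) {
        uint8_t v = 0;
        if (c >= 'A' && c <= 'Z') {
            v = (uint8_t)(c - 'A' + 0 + 1);
        } else if (c >= 'a' && c <= 'z') {
            v = (uint8_t)(c - 'a' + 26 + 1);
        } else if (c >= '0' && c <= '9') {
            v = (uint8_t)(c - '0' + 52 + 1);
        } else if (c == '+') {
            v = 62 + 1;
        } else if (c == '/') {
            v = 63 + 1;
        }
        if (v == 0) {
            return false; /* not in the base64 alphabet; mirrors decode_vec returning false */
        }
        *out = (uint8_t)(v - 1); /* drop the +1 bias to recover the 6-bit value */
        return true;
    }

In the vector version each range lives in its own register and the registers are ORed together at the end, so the range checks can proceed in parallel rather than as a chain of branches.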

source/encoding.c

Lines changed: 6 additions & 1 deletion
@@ -337,7 +337,12 @@ static int s_base64_encode(
 
     AWS_ASSERT(needed_capacity == 0 || output->buffer != NULL);
 
-    if (aws_common_private_has_avx2()) {
+    /*
+     * Note: avx2 impl currently does not support url base64 (no padding -> output not divisible by 4 -> it writes out
+     * of bounds). Just use software version for now (since need for base64 url is small) instead of hacking together
+     * half hearted avx2 impl.
+     */
+    if (!do_url_safe_encoding && aws_common_private_has_avx2()) {
         aws_common_private_base64_encode_sse41(
             to_encode->ptr, output->buffer + output->len, to_encode->len, do_url_safe_encoding);
         output->len += encoded_length;
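The new comment's reasoning comes down to length arithmetic: padded base64 output is always a multiple of 4 characters, while URL-safe output without '=' padding is not. A hypothetical helper (illustrative only, not an aws-c-common API) makes the mismatch concrete:

    #include <stddef.h>

    /* Hypothetical helpers for illustration; not part of the library. */
    static size_t base64_len_padded(size_t inlen) {
        return 4 * ((inlen + 2) / 3); /* always a multiple of 4 */
    }

    static size_t base64_len_unpadded(size_t inlen) {
        size_t rem = inlen % 3;
        return (inlen / 3) * 4 + (rem != 0 ? rem + 1 : 0); /* not a multiple of 4 when rem != 0 */
    }

For example, base64_len_padded(25) is 36 while base64_len_unpadded(25) is 34, so an encoder that always emits whole 4-character groups would write past the end of a buffer sized for the unpadded length. That is why the AVX2 path is now skipped whenever do_url_safe_encoding is set.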
