Skip to content

Commit 7ad8768

Browse files
ethercrow authored and Bodigrim committed
Use SSE2 in the x86_64 C version of decodeLatin1
1 parent c7e9611 commit 7ad8768

File tree

3 files changed

+31
-1
lines changed

3 files changed

+31
-1
lines changed

benchmarks/haskell/Benchmarks.hs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ main = do
4444
, env (DecodeUtf8.initEnv (tf "ascii.txt")) (DecodeUtf8.benchmark "ascii")
4545
, env (DecodeUtf8.initEnv (tf "russian.txt")) (DecodeUtf8.benchmark "russian")
4646
, env (DecodeUtf8.initEnv (tf "japanese.txt")) (DecodeUtf8.benchmark "japanese")
47+
, env (DecodeUtf8.initEnv (tf "ascii.txt")) (DecodeUtf8.benchmarkASCII)
4748
, EncodeUtf8.benchmark "επανάληψη 竺法蘭共譯"
4849
, env (Equality.initEnv (tf "japanese.txt")) Equality.benchmark
4950
, FileRead.benchmark (tf "russian.txt")

benchmarks/haskell/Benchmarks/DecodeUtf8.hs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
module Benchmarks.DecodeUtf8
1818
( initEnv
1919
, benchmark
20+
, benchmarkASCII
2021
) where
2122

2223
import Foreign.C.Types
@@ -62,6 +63,17 @@ benchmark kind ~(bs, lbs) =
6263
, bench "LazyInitLength" $ nf (TL.length . TL.init . TL.decodeUtf8) lbs
6364
]
6465

66+
benchmarkASCII :: Env -> Benchmark
67+
benchmarkASCII ~(bs, lbs) =
68+
bgroup "DecodeASCII"
69+
[ C.bench "strict decodeUtf8" $ nf T.decodeUtf8 bs
70+
, C.bench "strict decodeLatin1" $ nf T.decodeLatin1 bs
71+
, C.bench "strict decodeASCII" $ nf T.decodeASCII bs
72+
, C.bench "lazy decodeUtf8" $ nf TL.decodeUtf8 lbs
73+
, C.bench "lazy decodeLatin1" $ nf TL.decodeLatin1 lbs
74+
, C.bench "lazy decodeASCII" $ nf TL.decodeASCII lbs
75+
]
76+
6577
iconv :: B.ByteString -> IO CInt
6678
iconv (PS fp off len) = withForeignPtr fp $ \ptr ->
6779
time_iconv (ptr `plusPtr` off) (fromIntegral len)

cbits/cbits.c

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616

1717
#include "text_cbits.h"
1818

19-
2019
void _hs_text_memcpy(void *dest, size_t doff, const void *src, size_t soff,
2120
size_t n)
2221
{
@@ -88,6 +87,23 @@ _hs_text_decode_latin1(uint16_t *dest, const uint8_t *src,
8887
while (p != srcend && (uintptr_t)p & 0x3)
8988
*dest++ = *p++;
9089

90+
#if defined(__x86_64__)
91+
/* All the intrinsics used here are from SSE2,
92+
* so every x86_64 CPU supports them.
93+
*/
94+
const __m128i zeros = _mm_set1_epi32(0);
95+
while (p < srcend - 7) {
96+
/* Load 8 bytes of ASCII data */
97+
const __m128i ascii = _mm_cvtsi64_si128(*((const uint64_t *)p));
98+
/* Interleave with zeros */
99+
const __m128i utf16 = _mm_unpacklo_epi8(ascii, zeros);
100+
/* Store the resulting 16 bytes into destination */
101+
_mm_storeu_si128((__m128i *)dest, utf16);
102+
103+
dest += 8;
104+
p += 8;
105+
}
106+
#else
91107
/* iterate over 32-bit aligned loads */
92108
while (p < srcend - 3) {
93109
const uint32_t w = *((const uint32_t *)p);
@@ -99,6 +115,7 @@ _hs_text_decode_latin1(uint16_t *dest, const uint8_t *src,
99115

100116
p += 4;
101117
}
118+
#endif
102119
#endif
103120

104121
/* handle unaligned suffix */

0 commit comments

Comments
 (0)