Skip to content

Commit 14c6ae1

Browse files
committed
Reimplement decodeASCII and decodeLatin1 to share C code
1 parent 5945caf commit 14c6ae1

File tree

5 files changed

+89
-36
lines changed

5 files changed

+89
-36
lines changed

cbits/cbits.c

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -55,26 +55,6 @@ decode(uint32_t *state, uint32_t* codep, uint32_t byte) {
5555
return *state = utf8d[256 + *state + type];
5656
}
5757

58-
size_t
59-
_hs_text_decode_latin1(uint8_t *dest, const uint8_t *src,
60-
const uint8_t *srcend)
61-
{
62-
const uint8_t *dest0 = dest;
63-
const uint8_t *p = src;
64-
65-
while (p != srcend){
66-
uint8_t codepoint = *p++;
67-
if(codepoint < 0x80){
68-
*dest++ = (uint8_t)codepoint;
69-
} else {
70-
*dest++ = (uint8_t) (0xC0 + (codepoint >> 6));
71-
*dest++ = (uint8_t) (0x80 + (codepoint & 0x3F));
72-
}
73-
}
74-
75-
return (dest - dest0);
76-
}
77-
7858
/*
7959
* A best-effort decoder. Runs until it hits either end of input or
8060
* the start of an invalid byte sequence.

cbits/is_ascii.c

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
/*
2+
* Copyright (c) 2021 Andrew Lelechenko <[email protected]>
3+
*/
4+
5+
#include <string.h>
6+
#include <stdint.h>
7+
#include <sys/types.h>
8+
#ifdef __x86_64__
9+
#include <emmintrin.h>
10+
#include <xmmintrin.h>
11+
#endif
12+
#include <stdbool.h>
13+
14+
/*
15+
_hs_text_is_ascii takes a UTF-8 encoded buffer,
16+
and returns the length of the ASCII-compatible prefix.
17+
*/
18+
/*
 * Scan the byte range [src0, srcend) and return the length of its longest
 * prefix made up solely of ASCII bytes (bytes strictly below 0x80).
 *
 *   src0   - first byte of the buffer
 *   srcend - one past the last byte of the buffer (srcend >= src0)
 *
 * Returns srcend - src0 when the whole range is ASCII, otherwise the offset
 * of the first byte >= 0x80.
 */
const size_t _hs_text_is_ascii(const uint8_t *src0, const uint8_t *srcend){
  const uint8_t *src = src0;

#ifdef __x86_64__
  // I experimented with larger vector registers,
  // but did not notice any measurable speed up, so let's keep it simple.
  //
  // The loop condition compares the number of remaining bytes instead of
  // computing `srcend - 15`: for buffers shorter than 15 bytes that
  // expression would form a pointer before the start of the buffer,
  // which is undefined behaviour.
  while (srcend - src >= 16){
    __m128i w128 = _mm_loadu_si128((const __m128i *)src);
    // Bit i of the mask is the top bit of byte i,
    // so it is set exactly for the bytes that are >= 128.
    uint16_t mask = _mm_movemask_epi8(w128);
    if (mask) break;
    src += 16;
  }
#endif

  // Portable word-at-a-time pass: test the top bit of 8 bytes at once.
  // memcpy is used for the load so that unaligned addresses are fine.
  while (srcend - src >= 8){
    uint64_t w64;
    memcpy(&w64, src, sizeof(uint64_t));
    if (w64 & 0x8080808080808080ULL) break;
    src += 8;
  }

  // Byte-wise tail: finishes the remainder and pinpoints the exact
  // offending byte after one of the wider loops bailed out.
  while (src < srcend){
    uint8_t leadByte = *src;
    if (leadByte >= 0x80) break;
    src++;
  }

  return src - src0;
}

src/Data/Text/Encoding.hs

Lines changed: 37 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -64,21 +64,24 @@ import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
6464

6565
import Control.Exception (evaluate, try, throwIO, ErrorCall(ErrorCall))
6666
import Control.Monad.ST (runST)
67-
import Data.ByteString as B
67+
import Data.Bits (shiftR, (.&.))
68+
import Data.ByteString (ByteString)
69+
import qualified Data.ByteString as B
6870
import qualified Data.ByteString.Internal as B
71+
import qualified Data.ByteString.Short.Internal as SBS
6972
import Data.Foldable (traverse_)
7073
import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode)
71-
import Data.Text.Internal (Text(..), safe, text)
74+
import Data.Text.Internal (Text(..), safe, empty, text)
7275
import Data.Text.Internal.Private (runText)
7376
import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
7477
import Data.Text.Internal.Unsafe.Char (unsafeWrite)
7578
import Data.Text.Show ()
7679
import Data.Text.Unsafe (unsafeDupablePerformIO)
7780
import Data.Word (Word8, Word32)
78-
import Foreign.C.Types (CSize)
81+
import Foreign.C.Types (CSize(..))
7982
import Foreign.Marshal.Utils (with)
8083
import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
81-
import Foreign.Storable (Storable, peek, poke)
84+
import Foreign.Storable (Storable, peek, poke, peekByteOff)
8285
import GHC.Exts (MutableByteArray#, byteArrayContents#, unsafeCoerce#)
8386
import GHC.ForeignPtr (ForeignPtr(..), ForeignPtrContents(PlainPtr))
8487
import qualified Data.ByteString.Builder as B
@@ -112,7 +115,13 @@ import GHC.Stack (HasCallStack)
112115
-- | /Deprecated/. Decode a 'ByteString' containing 7-bit ASCII
113116
-- encoded text.
114117
decodeASCII :: ByteString -> Text
115-
decodeASCII = decodeUtf8
118+
decodeASCII bs = withBS bs decode
  where
    -- Empty input short-circuits to the empty text; otherwise the C helper
    -- measures the ASCII prefix, which must cover the whole input.
    decode fp len
      | len == 0  = empty
      | otherwise = runST $ do
          prefixLen <- unsafeIOToST $ unsafeWithForeignPtr fp $ \start ->
            fmap cSizeToInt (c_is_ascii start (start `plusPtr` len))
          if prefixLen /= len
            then error $ "decodeASCII: detected non-ASCII codepoint at " ++ show prefixLen
            else
              -- Pure ASCII is already valid UTF-8, so the bytes are reused
              -- verbatim by copying them into an unpinned byte array.
              case SBS.toShort bs of
                SBS.SBS arr -> return (Text (A.ByteArray arr) 0 len)
116125
{-# DEPRECATED decodeASCII "Use decodeUtf8 instead" #-}
117126

118127
-- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text.
@@ -124,13 +133,29 @@ decodeLatin1 ::
124133
HasCallStack =>
125134
#endif
126135
ByteString -> Text
127-
decodeLatin1 bs = withBS bs aux where
128-
aux fp len = text a 0 actualLen
129-
where
130-
(a, actualLen) = A.run2 (A.new (2 * len) >>= unsafeIOToST . go)
131-
go (A.MutableByteArray dest) = unsafeWithForeignPtr fp $ \src -> do
132-
destLen <- c_decode_latin1 dest src (src `plusPtr` len)
133-
return (A.MutableByteArray dest, destLen)
136+
decodeLatin1 bs = withBS bs $ \fp len -> runST $ do
  -- Worst case: every input byte is non-ASCII and becomes two output bytes.
  out <- A.new (2 * len)
  let -- Walk the input from offset 'inOff', writing at 'outOff';
      -- yields the total number of bytes written once the input is consumed.
      go inOff outOff
        | inOff >= len = return outOff
        | otherwise = do
            -- Length of the ASCII run starting at the current input offset.
            asciiRun <- unsafeIOToST $ unsafeWithForeignPtr fp $ \p ->
              fmap cSizeToInt (c_is_ascii (p `plusPtr` inOff) (p `plusPtr` len))
            if asciiRun /= 0
              then do
                -- ASCII bytes are copied through unchanged, in bulk.
                unsafeIOToST $ unsafeWithForeignPtr fp $ \p ->
                  unsafeSTToIO (A.copyFromPointer out outOff (p `plusPtr` inOff) asciiRun)
                go (inOff + asciiRun) (outOff + asciiRun)
              else do
                -- A byte >= 0x80 is emitted as a two-byte sequence:
                -- the top two bits go into the lead byte, the low six
                -- bits into the continuation byte.
                w <- unsafeIOToST $ unsafeWithForeignPtr fp $ \p -> peekByteOff p inOff
                A.unsafeWrite out outOff (0xC0 + (w `shiftR` 6))
                A.unsafeWrite out (outOff + 1) (0x80 + (w .&. 0x3F))
                go (inOff + 1) (outOff + 2)
  written <- go 0 0
  -- Trim the over-allocated buffer down to what was actually produced.
  shrunk <- A.resizeM out written
  frozen <- A.unsafeFreeze shrunk
  return (Text frozen 0 written)
156+
157+
-- | FFI binding to the C function @_hs_text_is_ascii@. It is given a pointer
-- to the first byte of a buffer and a pointer one past its last byte, and
-- returns the length of the buffer's ASCII-only prefix.
foreign import ccall unsafe "_hs_text_is_ascii" c_is_ascii
158+
:: Ptr Word8 -> Ptr Word8 -> IO CSize
134159

135160
-- | Decode a 'ByteString' containing UTF-8 encoded text.
136161
--
@@ -538,6 +563,3 @@ foreign import ccall unsafe "_hs_text_decode_utf8_state" c_decode_utf8_with_stat
538563
:: MutableByteArray# s -> Ptr CSize
539564
-> Ptr (Ptr Word8) -> Ptr Word8
540565
-> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8)
541-
542-
foreign import ccall unsafe "_hs_text_decode_latin1" c_decode_latin1
543-
:: MutableByteArray# s -> Ptr Word8 -> Ptr Word8 -> IO Int

src/Data/Text/Lazy/Encoding.hs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
{-# LANGUAGE BangPatterns,CPP #-}
22
{-# LANGUAGE Trustworthy #-}
3+
4+
{-# OPTIONS_GHC -fno-warn-deprecations #-}
5+
36
-- |
47
-- Module : Data.Text.Lazy.Encoding
58
-- Copyright : (c) 2009, 2010 Bryan O'Sullivan
@@ -80,7 +83,7 @@ import Data.Text.Unsafe (unsafeDupablePerformIO)
8083
-- | /Deprecated/. Decode a 'ByteString' containing 7-bit ASCII
8184
-- encoded text.
8285
decodeASCII :: B.ByteString -> Text
83-
decodeASCII = decodeUtf8
86+
decodeASCII lbs = foldr prependChunk empty (B.toChunks lbs)
  where
    -- Decode one strict chunk and put it in front of the lazy remainder.
    prependChunk strictBs rest = chunk (TE.decodeASCII strictBs) rest
8487
{-# DEPRECATED decodeASCII "Use decodeUtf8 instead" #-}
8588

8689
-- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text.

text.cabal

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ flag developer
6565

6666
library
6767
c-sources: cbits/cbits.c
68+
cbits/is_ascii.c
6869
cbits/measure_off.c
6970
cbits/reverse.c
7071
cbits/utils.c

0 commit comments

Comments
 (0)