Skip to content

Commit 9075b05

Browse files
committed
Better fallback when simdutf is not available
1 parent 835ccd6 commit 9075b05

File tree

2 files changed

+33
-13
lines changed

2 files changed

+33
-13
lines changed

src/Data/Text/Encoding.hs

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -79,9 +79,6 @@ import Data.Text.Show as T (singleton)
7979
import Data.Text.Unsafe (unsafeDupablePerformIO)
8080
import Data.Word (Word8)
8181
import Foreign.C.Types (CSize(..))
82-
#ifdef SIMDUTF
83-
import Foreign.C.Types (CInt(..))
84-
#endif
8582
import Foreign.Ptr (Ptr, minusPtr, plusPtr)
8683
import Foreign.Storable (poke, peekByteOff)
8784
import GHC.Exts (byteArrayContents#, unsafeCoerce#)
@@ -99,6 +96,13 @@ import Data.Text.Internal.ByteStringCompat
9996
import GHC.Stack (HasCallStack)
10097
#endif
10198

99+
#ifdef SIMDUTF
100+
import Foreign.C.Types (CInt(..))
101+
#else
102+
import qualified Data.ByteString.Unsafe as B
103+
import Data.Text.Internal.Encoding.Utf8 (CodePoint(..))
104+
#endif
105+
102106
-- $strict
103107
--
104108
-- All of the single-parameter functions for decoding bytestrings
@@ -164,10 +168,30 @@ decodeLatin1 bs = withBS bs $ \fp len -> runST $ do
164168
foreign import ccall unsafe "_hs_text_is_ascii" c_is_ascii
165169
:: Ptr Word8 -> Ptr Word8 -> IO CSize
166170

167-
#ifdef SIMDUTF
168171
isValidBS :: ByteString -> Bool
172+
#ifdef SIMDUTF
169173
isValidBS bs = withBS bs $ \fp len -> unsafeDupablePerformIO $
170174
unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$> c_is_valid_utf8 ptr (fromIntegral len)
175+
#else
176+
#if MIN_VERSION_bytestring(0,11,2)
177+
isValidBS = B.isValidUtf8
178+
#else
179+
isValidBS bs = start 0
180+
where
181+
start ix
182+
| ix >= B.length bs = True
183+
| otherwise = case utf8DecodeStart (B.unsafeIndex bs ix) of
184+
Accept{} -> start (ix + 1)
185+
Reject{} -> False
186+
Incomplete st _ -> step (ix + 1) st
187+
step ix st
188+
| ix >= B.length bs = False
189+
-- We do not use decoded code point, so passing a dummy value to save an argument.
190+
| otherwise = case utf8DecodeContinue (B.unsafeIndex bs ix) st (CodePoint 0) of
191+
Accept{} -> start (ix + 1)
192+
Reject{} -> False
193+
Incomplete st' _ -> step (ix + 1) st'
194+
#endif
171195
#endif
172196

173197
-- | Decode a 'ByteString' containing UTF-8 encoded text.
@@ -180,11 +204,9 @@ decodeUtf8With ::
180204
#endif
181205
OnDecodeError -> ByteString -> Text
182206
decodeUtf8With onErr bs
183-
#ifdef SIMDUTF
184207
| isValidBS bs =
185208
let !(SBS.SBS arr) = SBS.toShort bs in
186209
(Text (A.ByteArray arr) 0 (B.length bs))
187-
#endif
188210
| B.null undecoded = txt
189211
| otherwise = txt `append` (case onErr desc (Just (B.head undecoded)) of
190212
Nothing -> txt'
@@ -211,7 +233,6 @@ decodeUtf8With2 onErr bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do
211233
| i < len1 = B.index bs1 i
212234
| otherwise = B.index bs2 (i - len1)
213235

214-
#ifdef SIMDUTF
215236
-- We need Data.ByteString.findIndexEnd, but it is unavailable before bytestring-0.10.12.0
216237
guessUtf8Boundary :: Int
217238
guessUtf8Boundary
@@ -226,7 +247,6 @@ decodeUtf8With2 onErr bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do
226247
w1 = B.index bs2 (len2 - 2)
227248
w2 = B.index bs2 (len2 - 3)
228249
w3 = B.index bs2 (len2 - 4)
229-
#endif
230250

231251
decodeFrom :: Int -> DecoderResult
232252
decodeFrom off = step (off + 1) (utf8DecodeStart (index off))
@@ -244,7 +264,6 @@ decodeUtf8With2 onErr bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do
244264
arr <- A.unsafeFreeze dst
245265
return (Text arr 0 dstOff, mempty)
246266

247-
#ifdef SIMDUTF
248267
| srcOff >= len1
249268
, srcOff < len1 + guessUtf8Boundary
250269
, dstOff + (len1 + guessUtf8Boundary - srcOff) <= dstLen
@@ -253,7 +272,6 @@ decodeUtf8With2 onErr bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do
253272
withBS bs $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
254273
unsafeSTToIO $ A.copyFromPointer dst dstOff src (len1 + guessUtf8Boundary - srcOff)
255274
inner (len1 + guessUtf8Boundary) (dstOff + (len1 + guessUtf8Boundary - srcOff))
256-
#endif
257275

258276
| dstOff + 4 > dstLen = do
259277
let dstLen' = dstLen + 4

src/Data/Text/Internal/Encoding/Utf8.hs

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ module Data.Text.Internal.Encoding.Utf8
3535
, validate4
3636
-- * Naive decoding
3737
, DecoderResult(..)
38+
, DecoderState(..)
39+
, CodePoint(..)
3840
, utf8DecodeStart
3941
, utf8DecodeContinue
4042
) where
@@ -269,18 +271,18 @@ data DecoderResult
269271

270272
-- | @since 2.0
271273
utf8DecodeStart :: Word8 -> DecoderResult
272-
utf8DecodeStart w
274+
utf8DecodeStart !w
273275
| st == utf8AcceptState = Accept (chr (word8ToInt w))
274276
| st == utf8RejectState = Reject
275277
| otherwise = Incomplete st (CodePoint cp)
276278
where
277279
cl@(ByteClass cl') = byteToClass w
278280
st = updateState cl utf8AcceptState
279-
cp = word8ToInt $ (0xff `shiftR` word8ToInt cl') .&. w
281+
cp = word8ToInt $ (0xff `unsafeShiftR` word8ToInt cl') .&. w
280282

281283
-- | @since 2.0
282284
utf8DecodeContinue :: Word8 -> DecoderState -> CodePoint -> DecoderResult
283-
utf8DecodeContinue w st (CodePoint cp)
285+
utf8DecodeContinue !w !st (CodePoint !cp)
284286
| st' == utf8AcceptState = Accept (chr cp')
285287
| st' == utf8RejectState = Reject
286288
| otherwise = Incomplete st' (CodePoint cp')

0 commit comments

Comments
 (0)