@@ -79,9 +79,6 @@ import Data.Text.Show as T (singleton)
79
79
import Data.Text.Unsafe (unsafeDupablePerformIO )
80
80
import Data.Word (Word8 )
81
81
import Foreign.C.Types (CSize (.. ))
82
- #ifdef SIMDUTF
83
- import Foreign.C.Types (CInt (.. ))
84
- #endif
85
82
import Foreign.Ptr (Ptr , minusPtr , plusPtr )
86
83
import Foreign.Storable (poke , peekByteOff )
87
84
import GHC.Exts (byteArrayContents #, unsafeCoerce #)
@@ -99,6 +96,13 @@ import Data.Text.Internal.ByteStringCompat
99
96
import GHC.Stack (HasCallStack )
100
97
#endif
101
98
99
+ #ifdef SIMDUTF
100
+ import Foreign.C.Types (CInt (.. ))
101
+ #else
102
+ import qualified Data.ByteString.Unsafe as B
103
+ import Data.Text.Internal.Encoding.Utf8 (CodePoint (.. ))
104
+ #endif
105
+
102
106
-- $strict
103
107
--
104
108
-- All of the single-parameter functions for decoding bytestrings
@@ -164,10 +168,30 @@ decodeLatin1 bs = withBS bs $ \fp len -> runST $ do
164
168
-- | FFI import of the C function @_hs_text_is_ascii@, bound here as
-- 'c_is_ascii'. It takes two 'Word8' pointers and yields a 'CSize' in 'IO'.
--
-- NOTE(review): presumably the two pointers delimit a byte range and the
-- result is the length of its leading ASCII run; confirm against the C source.
foreign import ccall unsafe " _hs_text_is_ascii" c_is_ascii
165
169
:: Ptr Word8 -> Ptr Word8 -> IO CSize
166
170
167
- #ifdef SIMDUTF
168
171
-- | Check whether a 'ByteString' is valid UTF-8.
--
-- One of three implementations is selected by CPP:
--
-- * with @SIMDUTF@ defined, the check is delegated to @c_is_valid_utf8@;
-- * otherwise, with bytestring >= 0.11.2, to @B.isValidUtf8@;
-- * otherwise, to a byte-by-byte walk driven by 'utf8DecodeStart' and
--   'utf8DecodeContinue'.
isValidBS :: ByteString -> Bool
172
+ #ifdef SIMDUTF
169
173
-- Pass the buffer pointer and its length to the C routine; any non-zero
-- result is treated as "valid".
isValidBS bs = withBS bs $ \ fp len -> unsafeDupablePerformIO $
170
174
unsafeWithForeignPtr fp $ \ ptr -> (/= 0 ) <$> c_is_valid_utf8 ptr (fromIntegral len)
175
+ #else
176
+ #if MIN_VERSION_bytestring(0,11,2)
177
+ isValidBS = B. isValidUtf8
178
+ #else
179
+ isValidBS bs = start 0
180
+ where
181
-- 'start' runs at the beginning of a code point: reaching the end of the
-- input here means every byte was accepted, so the answer is True.
+ start ix
182
+ | ix >= B. length bs = True
183
+ | otherwise = case utf8DecodeStart (B. unsafeIndex bs ix) of
184
+ Accept {} -> start (ix + 1 )
185
+ Reject {} -> False
186
+ Incomplete st _ -> step (ix + 1 ) st
187
-- 'step' runs after an Incomplete result: reaching the end of the input
-- here means the last sequence was cut short, so the answer is False.
+ step ix st
188
+ | ix >= B. length bs = False
189
+ -- The decoded code point is never used, so a dummy value is passed to save an argument.
190
+ | otherwise = case utf8DecodeContinue (B. unsafeIndex bs ix) st (CodePoint 0 ) of
191
+ Accept {} -> start (ix + 1 )
192
+ Reject {} -> False
193
+ Incomplete st' _ -> step (ix + 1 ) st'
194
+ #endif
171
195
#endif
172
196
173
197
-- | Decode a 'ByteString' containing UTF-8 encoded text.
@@ -180,11 +204,9 @@ decodeUtf8With ::
180
204
#endif
181
205
OnDecodeError -> ByteString -> Text
182
206
decodeUtf8With onErr bs
183
- #ifdef SIMDUTF
184
207
| isValidBS bs =
185
208
let ! (SBS. SBS arr) = SBS. toShort bs in
186
209
(Text (A. ByteArray arr) 0 (B. length bs))
187
- #endif
188
210
| B. null undecoded = txt
189
211
| otherwise = txt `append` (case onErr desc (Just (B. head undecoded)) of
190
212
Nothing -> txt'
@@ -211,7 +233,6 @@ decodeUtf8With2 onErr bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do
211
233
| i < len1 = B. index bs1 i
212
234
| otherwise = B. index bs2 (i - len1)
213
235
214
- #ifdef SIMDUTF
215
236
-- We need Data.ByteString.findIndexEnd, but it is unavailable before bytestring-0.10.12.0
216
237
guessUtf8Boundary :: Int
217
238
guessUtf8Boundary
@@ -226,7 +247,6 @@ decodeUtf8With2 onErr bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do
226
247
w1 = B. index bs2 (len2 - 2 )
227
248
w2 = B. index bs2 (len2 - 3 )
228
249
w3 = B. index bs2 (len2 - 4 )
229
- #endif
230
250
231
251
decodeFrom :: Int -> DecoderResult
232
252
decodeFrom off = step (off + 1 ) (utf8DecodeStart (index off))
@@ -244,7 +264,6 @@ decodeUtf8With2 onErr bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do
244
264
arr <- A. unsafeFreeze dst
245
265
return (Text arr 0 dstOff, mempty )
246
266
247
- #ifdef SIMDUTF
248
267
| srcOff >= len1
249
268
, srcOff < len1 + guessUtf8Boundary
250
269
, dstOff + (len1 + guessUtf8Boundary - srcOff) <= dstLen
@@ -253,7 +272,6 @@ decodeUtf8With2 onErr bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do
253
272
withBS bs $ \ fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \ src ->
254
273
unsafeSTToIO $ A. copyFromPointer dst dstOff src (len1 + guessUtf8Boundary - srcOff)
255
274
inner (len1 + guessUtf8Boundary) (dstOff + (len1 + guessUtf8Boundary - srcOff))
256
- #endif
257
275
258
276
| dstOff + 4 > dstLen = do
259
277
let dstLen' = dstLen + 4
0 commit comments