@@ -96,6 +96,7 @@ import qualified Data.Text.Array as A
96
96
import qualified Data.Text.Internal.Encoding.Fusion as E
97
97
import qualified Data.Text.Internal.Encoding.Utf16 as U16
98
98
import qualified Data.Text.Internal.Fusion as F
99
+ import Data.Text.Internal.ByteStringCompat
99
100
100
101
#include "text_cbits.h"
101
102
@@ -123,12 +124,13 @@ decodeASCII = decodeUtf8
123
124
-- 'decodeLatin1' is semantically equivalent to
124
125
-- @Data.Text.pack . Data.ByteString.Char8.unpack@
125
126
decodeLatin1 :: ByteString -> Text
126
- decodeLatin1 (PS fp off len) = text a 0 len
127
- where
128
- a = A. run (A. new len >>= unsafeIOToST . go)
129
- go dest = withForeignPtr fp $ \ ptr -> do
130
- c_decode_latin1 (A. maBA dest) (ptr `plusPtr` off) (ptr `plusPtr` (off+ len))
131
- return dest
127
+ decodeLatin1 bs = withBS bs aux where
128
+ aux fp len = text a 0 len
129
+ where
130
+ a = A. run (A. new len >>= unsafeIOToST . go)
131
+ go dest = withForeignPtr fp $ \ ptr -> do
132
+ c_decode_latin1 (A. maBA dest) ptr (ptr `plusPtr` len)
133
+ return dest
132
134
133
135
-- | Decode a 'ByteString' containing UTF-8 encoded text.
134
136
--
@@ -139,36 +141,38 @@ decodeLatin1 (PS fp off len) = text a 0 len
139
141
-- 'error' (/since 1.2.3.1/); For earlier versions of @text@ using
140
142
-- those unsupported code points would result in undefined behavior.
141
143
decodeUtf8With :: OnDecodeError -> ByteString -> Text
142
- decodeUtf8With onErr (PS fp off len) = runText $ \ done -> do
143
- let go dest = withForeignPtr fp $ \ ptr ->
144
- with (0 :: CSize ) $ \ destOffPtr -> do
145
- let end = ptr `plusPtr` (off + len)
146
- loop curPtr = do
147
- curPtr' <- c_decode_utf8 (A. maBA dest) destOffPtr curPtr end
148
- if curPtr' == end
149
- then do
150
- n <- peek destOffPtr
151
- unsafeSTToIO (done dest (fromIntegral n))
152
- else do
153
- x <- peek curPtr'
154
- case onErr desc (Just x) of
155
- Nothing -> loop $ curPtr' `plusPtr` 1
156
- Just c
157
- | c > ' \xFFFF ' -> throwUnsupportedReplChar
158
- | otherwise -> do
159
- destOff <- peek destOffPtr
160
- w <- unsafeSTToIO $
161
- unsafeWrite dest (fromIntegral destOff)
162
- (safe c)
163
- poke destOffPtr (destOff + fromIntegral w)
164
- loop $ curPtr' `plusPtr` 1
165
- loop (ptr `plusPtr` off)
166
- (unsafeIOToST . go) =<< A. new len
144
+ decodeUtf8With onErr bs = withBS bs aux
167
145
where
168
- desc = " Data.Text.Internal.Encoding.decodeUtf8: Invalid UTF-8 stream"
146
+ aux fp len = runText $ \ done -> do
147
+ let go dest = withForeignPtr fp $ \ ptr ->
148
+ with (0 :: CSize ) $ \ destOffPtr -> do
149
+ let end = ptr `plusPtr` len
150
+ loop curPtr = do
151
+ curPtr' <- c_decode_utf8 (A. maBA dest) destOffPtr curPtr end
152
+ if curPtr' == end
153
+ then do
154
+ n <- peek destOffPtr
155
+ unsafeSTToIO (done dest (fromIntegral n))
156
+ else do
157
+ x <- peek curPtr'
158
+ case onErr desc (Just x) of
159
+ Nothing -> loop $ curPtr' `plusPtr` 1
160
+ Just c
161
+ | c > ' \xFFFF ' -> throwUnsupportedReplChar
162
+ | otherwise -> do
163
+ destOff <- peek destOffPtr
164
+ w <- unsafeSTToIO $
165
+ unsafeWrite dest (fromIntegral destOff)
166
+ (safe c)
167
+ poke destOffPtr (destOff + fromIntegral w)
168
+ loop $ curPtr' `plusPtr` 1
169
+ loop ptr
170
+ (unsafeIOToST . go) =<< A. new len
171
+ where
172
+ desc = " Data.Text.Internal.Encoding.decodeUtf8: Invalid UTF-8 stream"
169
173
170
- throwUnsupportedReplChar = throwIO $
171
- ErrorCall " decodeUtf8With: non-BMP replacement characters not supported"
174
+ throwUnsupportedReplChar = throwIO $
175
+ ErrorCall " decodeUtf8With: non-BMP replacement characters not supported"
172
176
-- TODO: The code currently assumes that the transcoded UTF-16
173
177
-- stream is at most twice as long (in bytes) as the input UTF-8
174
178
-- stream. To justify this assumption one has to assume that the
@@ -292,50 +296,50 @@ streamDecodeUtf8With onErr = decodeChunk B.empty 0 0
292
296
-- potential surrogate pair started in the last buffer
293
297
decodeChunk :: ByteString -> CodePoint -> DecoderState -> ByteString
294
298
-> Decoding
295
- decodeChunk undecoded0 codepoint0 state0 bs@ ( PS fp off len) =
296
- runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A. new (len+ 1 )
297
- where
298
- decodeChunkToBuffer :: A. MArray s -> IO Decoding
299
- decodeChunkToBuffer dest = withForeignPtr fp $ \ ptr ->
300
- with (0 :: CSize ) $ \ destOffPtr ->
301
- with codepoint0 $ \ codepointPtr ->
302
- with state0 $ \ statePtr ->
303
- with nullPtr $ \ curPtrPtr ->
304
- let end = ptr `plusPtr` (off + len)
305
- loop curPtr = do
306
- poke curPtrPtr curPtr
307
- curPtr' <- c_decode_utf8_with_state (A. maBA dest) destOffPtr
308
- curPtrPtr end codepointPtr statePtr
309
- state <- peek statePtr
310
- case state of
311
- UTF8_REJECT -> do
312
- -- We encountered an encoding error
313
- x <- peek curPtr'
314
- poke statePtr 0
315
- case onErr desc (Just x) of
316
- Nothing -> loop $ curPtr' `plusPtr` 1
317
- Just c -> do
318
- destOff <- peek destOffPtr
319
- w <- unsafeSTToIO $
320
- unsafeWrite dest (fromIntegral destOff) (safe c)
321
- poke destOffPtr (destOff + fromIntegral w)
322
- loop $ curPtr' `plusPtr` 1
323
-
324
- _ -> do
325
- -- We encountered the end of the buffer while decoding
326
- n <- peek destOffPtr
327
- codepoint <- peek codepointPtr
328
- chunkText <- unsafeSTToIO $ do
329
- arr <- A. unsafeFreeze dest
330
- return $! text arr 0 (fromIntegral n)
331
- lastPtr <- peek curPtrPtr
332
- let left = lastPtr `minusPtr` curPtr
333
- ! undecoded = case state of
334
- UTF8_ACCEPT -> B. empty
335
- _ -> B. append undecoded0 (B. drop left bs)
336
- return $ Some chunkText undecoded
337
- (decodeChunk undecoded codepoint state)
338
- in loop ( ptr `plusPtr` off)
299
+ decodeChunk undecoded0 codepoint0 state0 bs = withBS bs aux where
300
+ aux fp len = runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A. new (len+ 1 )
301
+ where
302
+ decodeChunkToBuffer :: A. MArray s -> IO Decoding
303
+ decodeChunkToBuffer dest = withForeignPtr fp $ \ ptr ->
304
+ with (0 :: CSize ) $ \ destOffPtr ->
305
+ with codepoint0 $ \ codepointPtr ->
306
+ with state0 $ \ statePtr ->
307
+ with nullPtr $ \ curPtrPtr ->
308
+ let end = ptr `plusPtr` len
309
+ loop curPtr = do
310
+ poke curPtrPtr curPtr
311
+ curPtr' <- c_decode_utf8_with_state (A. maBA dest) destOffPtr
312
+ curPtrPtr end codepointPtr statePtr
313
+ state <- peek statePtr
314
+ case state of
315
+ UTF8_REJECT -> do
316
+ -- We encountered an encoding error
317
+ x <- peek curPtr'
318
+ poke statePtr 0
319
+ case onErr desc (Just x) of
320
+ Nothing -> loop $ curPtr' `plusPtr` 1
321
+ Just c -> do
322
+ destOff <- peek destOffPtr
323
+ w <- unsafeSTToIO $
324
+ unsafeWrite dest (fromIntegral destOff) (safe c)
325
+ poke destOffPtr (destOff + fromIntegral w)
326
+ loop $ curPtr' `plusPtr` 1
327
+
328
+ _ -> do
329
+ -- We encountered the end of the buffer while decoding
330
+ n <- peek destOffPtr
331
+ codepoint <- peek codepointPtr
332
+ chunkText <- unsafeSTToIO $ do
333
+ arr <- A. unsafeFreeze dest
334
+ return $! text arr 0 (fromIntegral n)
335
+ lastPtr <- peek curPtrPtr
336
+ let left = lastPtr `minusPtr` curPtr
337
+ ! undecoded = case state of
338
+ UTF8_ACCEPT -> B. empty
339
+ _ -> B. append undecoded0 (B. drop left bs)
340
+ return $ Some chunkText undecoded
341
+ (decodeChunk undecoded codepoint state)
342
+ in loop ptr
339
343
desc = " Data.Text.Internal.Encoding.streamDecodeUtf8With: Invalid UTF-8 stream"
340
344
341
345
-- | Decode a 'ByteString' containing UTF-8 encoded text that is known
@@ -436,12 +440,12 @@ encodeUtf8 (Text arr off len)
436
440
newDest <- peek destPtr
437
441
let utf8len = newDest `minusPtr` ptr
438
442
if utf8len >= len `shiftR` 1
439
- then return (PS fp 0 utf8len)
443
+ then return (mkBS fp utf8len)
440
444
else do
441
445
fp' <- mallocByteString utf8len
442
446
withForeignPtr fp' $ \ ptr' -> do
443
447
memcpy ptr' ptr (fromIntegral utf8len)
444
- return (PS fp' 0 utf8len)
448
+ return (mkBS fp' utf8len)
445
449
446
450
-- | Decode text from little endian UTF-16 encoding.
447
451
decodeUtf16LEWith :: OnDecodeError -> ByteString -> Text
0 commit comments