Skip to content

Commit b7bff71

Browse files
authored
Merge pull request #296 from phadej/bytestring-0.11
bytestring-0.11
2 parents 7d3130b + 21281e7 commit b7bff71

File tree

10 files changed

+183
-84
lines changed

10 files changed

+183
-84
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
/GNUmakefile
99
/.ghc.environment.*
1010
/cabal.project.local
11+
/cabal.test.project.local
1112

1213
# Test data repo ignored. Please see instruction in tests-and-benchmarks.markdown
1314
/tests/text-test-data/

benchmarks/text-benchmarks.cabal

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ executable text-benchmarks
101101
Data.Text.Internal.Builder.Functions
102102
Data.Text.Internal.Builder.Int.Digits
103103
Data.Text.Internal.Builder.RealFloat.Functions
104+
Data.Text.Internal.ByteStringCompat
104105
Data.Text.Internal.Encoding.Fusion
105106
Data.Text.Internal.Encoding.Fusion.Common
106107
Data.Text.Internal.Encoding.Utf16

cabal.tests.project

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
-- this project doesn't have local 'text' package,
2+
-- so tests build faster.
3+
4+
packages: tests
5+
tests: True

scripts/tests.sh

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#!/bin/sh
2+
3+
set -ex
4+
5+
runtest() {
6+
HC=$1
7+
shift
8+
9+
# EDIT last line to pass arguments
10+
11+
cabal run text-tests:test:tests \
12+
--project-file=cabal.tests.project \
13+
--builddir="dist-newstyle/$HC" \
14+
--with-compiler="$HC" \
15+
-- "$@"
16+
}
17+
18+
runtest ghc-8.10.2 "$@"
19+
runtest ghc-8.8.4 "$@"
20+
runtest ghc-8.6.5 "$@"
21+
runtest ghc-8.4.4 "$@"
22+
runtest ghc-8.2.2 "$@"
23+
runtest ghc-8.0.2 "$@"
24+
25+
runtest ghc-7.10.3 "$@"
26+
runtest ghc-7.8.4 "$@"
27+
runtest ghc-7.6.3 "$@"
28+
runtest ghc-7.4.2 "$@"
29+
runtest ghc-7.2.2 "$@"
30+
runtest ghc-7.0.4 "$@"

src/Data/Text/Encoding.hs

Lines changed: 84 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ import qualified Data.Text.Array as A
9696
import qualified Data.Text.Internal.Encoding.Fusion as E
9797
import qualified Data.Text.Internal.Encoding.Utf16 as U16
9898
import qualified Data.Text.Internal.Fusion as F
99+
import Data.Text.Internal.ByteStringCompat
99100

100101
#include "text_cbits.h"
101102

@@ -123,12 +124,13 @@ decodeASCII = decodeUtf8
123124
-- 'decodeLatin1' is semantically equivalent to
124125
-- @Data.Text.pack . Data.ByteString.Char8.unpack@
125126
decodeLatin1 :: ByteString -> Text
126-
decodeLatin1 (PS fp off len) = text a 0 len
127-
where
128-
a = A.run (A.new len >>= unsafeIOToST . go)
129-
go dest = withForeignPtr fp $ \ptr -> do
130-
c_decode_latin1 (A.maBA dest) (ptr `plusPtr` off) (ptr `plusPtr` (off+len))
131-
return dest
127+
decodeLatin1 bs = withBS bs aux where
128+
aux fp len = text a 0 len
129+
where
130+
a = A.run (A.new len >>= unsafeIOToST . go)
131+
go dest = withForeignPtr fp $ \ptr -> do
132+
c_decode_latin1 (A.maBA dest) ptr (ptr `plusPtr` len)
133+
return dest
132134

133135
-- | Decode a 'ByteString' containing UTF-8 encoded text.
134136
--
@@ -139,36 +141,38 @@ decodeLatin1 (PS fp off len) = text a 0 len
139141
-- 'error' (/since 1.2.3.1/); For earlier versions of @text@ using
140142
-- those unsupported code points would result in undefined behavior.
141143
decodeUtf8With :: OnDecodeError -> ByteString -> Text
142-
decodeUtf8With onErr (PS fp off len) = runText $ \done -> do
143-
let go dest = withForeignPtr fp $ \ptr ->
144-
with (0::CSize) $ \destOffPtr -> do
145-
let end = ptr `plusPtr` (off + len)
146-
loop curPtr = do
147-
curPtr' <- c_decode_utf8 (A.maBA dest) destOffPtr curPtr end
148-
if curPtr' == end
149-
then do
150-
n <- peek destOffPtr
151-
unsafeSTToIO (done dest (fromIntegral n))
152-
else do
153-
x <- peek curPtr'
154-
case onErr desc (Just x) of
155-
Nothing -> loop $ curPtr' `plusPtr` 1
156-
Just c
157-
| c > '\xFFFF' -> throwUnsupportedReplChar
158-
| otherwise -> do
159-
destOff <- peek destOffPtr
160-
w <- unsafeSTToIO $
161-
unsafeWrite dest (fromIntegral destOff)
162-
(safe c)
163-
poke destOffPtr (destOff + fromIntegral w)
164-
loop $ curPtr' `plusPtr` 1
165-
loop (ptr `plusPtr` off)
166-
(unsafeIOToST . go) =<< A.new len
144+
decodeUtf8With onErr bs = withBS bs aux
167145
where
168-
desc = "Data.Text.Internal.Encoding.decodeUtf8: Invalid UTF-8 stream"
146+
aux fp len = runText $ \done -> do
147+
let go dest = withForeignPtr fp $ \ptr ->
148+
with (0::CSize) $ \destOffPtr -> do
149+
let end = ptr `plusPtr` len
150+
loop curPtr = do
151+
curPtr' <- c_decode_utf8 (A.maBA dest) destOffPtr curPtr end
152+
if curPtr' == end
153+
then do
154+
n <- peek destOffPtr
155+
unsafeSTToIO (done dest (fromIntegral n))
156+
else do
157+
x <- peek curPtr'
158+
case onErr desc (Just x) of
159+
Nothing -> loop $ curPtr' `plusPtr` 1
160+
Just c
161+
| c > '\xFFFF' -> throwUnsupportedReplChar
162+
| otherwise -> do
163+
destOff <- peek destOffPtr
164+
w <- unsafeSTToIO $
165+
unsafeWrite dest (fromIntegral destOff)
166+
(safe c)
167+
poke destOffPtr (destOff + fromIntegral w)
168+
loop $ curPtr' `plusPtr` 1
169+
loop ptr
170+
(unsafeIOToST . go) =<< A.new len
171+
where
172+
desc = "Data.Text.Internal.Encoding.decodeUtf8: Invalid UTF-8 stream"
169173

170-
throwUnsupportedReplChar = throwIO $
171-
ErrorCall "decodeUtf8With: non-BMP replacement characters not supported"
174+
throwUnsupportedReplChar = throwIO $
175+
ErrorCall "decodeUtf8With: non-BMP replacement characters not supported"
172176
-- TODO: The code currently assumes that the transcoded UTF-16
173177
-- stream is at most twice as long (in bytes) as the input UTF-8
174178
-- stream. To justify this assumption one has to assume that the
@@ -292,50 +296,50 @@ streamDecodeUtf8With onErr = decodeChunk B.empty 0 0
292296
-- potential surrogate pair started in the last buffer
293297
decodeChunk :: ByteString -> CodePoint -> DecoderState -> ByteString
294298
-> Decoding
295-
decodeChunk undecoded0 codepoint0 state0 bs@(PS fp off len) =
296-
runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1)
297-
where
298-
decodeChunkToBuffer :: A.MArray s -> IO Decoding
299-
decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
300-
with (0::CSize) $ \destOffPtr ->
301-
with codepoint0 $ \codepointPtr ->
302-
with state0 $ \statePtr ->
303-
with nullPtr $ \curPtrPtr ->
304-
let end = ptr `plusPtr` (off + len)
305-
loop curPtr = do
306-
poke curPtrPtr curPtr
307-
curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr
308-
curPtrPtr end codepointPtr statePtr
309-
state <- peek statePtr
310-
case state of
311-
UTF8_REJECT -> do
312-
-- We encountered an encoding error
313-
x <- peek curPtr'
314-
poke statePtr 0
315-
case onErr desc (Just x) of
316-
Nothing -> loop $ curPtr' `plusPtr` 1
317-
Just c -> do
318-
destOff <- peek destOffPtr
319-
w <- unsafeSTToIO $
320-
unsafeWrite dest (fromIntegral destOff) (safe c)
321-
poke destOffPtr (destOff + fromIntegral w)
322-
loop $ curPtr' `plusPtr` 1
323-
324-
_ -> do
325-
-- We encountered the end of the buffer while decoding
326-
n <- peek destOffPtr
327-
codepoint <- peek codepointPtr
328-
chunkText <- unsafeSTToIO $ do
329-
arr <- A.unsafeFreeze dest
330-
return $! text arr 0 (fromIntegral n)
331-
lastPtr <- peek curPtrPtr
332-
let left = lastPtr `minusPtr` curPtr
333-
!undecoded = case state of
334-
UTF8_ACCEPT -> B.empty
335-
_ -> B.append undecoded0 (B.drop left bs)
336-
return $ Some chunkText undecoded
337-
(decodeChunk undecoded codepoint state)
338-
in loop (ptr `plusPtr` off)
299+
decodeChunk undecoded0 codepoint0 state0 bs = withBS bs aux where
300+
aux fp len = runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1)
301+
where
302+
decodeChunkToBuffer :: A.MArray s -> IO Decoding
303+
decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
304+
with (0::CSize) $ \destOffPtr ->
305+
with codepoint0 $ \codepointPtr ->
306+
with state0 $ \statePtr ->
307+
with nullPtr $ \curPtrPtr ->
308+
let end = ptr `plusPtr` len
309+
loop curPtr = do
310+
poke curPtrPtr curPtr
311+
curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr
312+
curPtrPtr end codepointPtr statePtr
313+
state <- peek statePtr
314+
case state of
315+
UTF8_REJECT -> do
316+
-- We encountered an encoding error
317+
x <- peek curPtr'
318+
poke statePtr 0
319+
case onErr desc (Just x) of
320+
Nothing -> loop $ curPtr' `plusPtr` 1
321+
Just c -> do
322+
destOff <- peek destOffPtr
323+
w <- unsafeSTToIO $
324+
unsafeWrite dest (fromIntegral destOff) (safe c)
325+
poke destOffPtr (destOff + fromIntegral w)
326+
loop $ curPtr' `plusPtr` 1
327+
328+
_ -> do
329+
-- We encountered the end of the buffer while decoding
330+
n <- peek destOffPtr
331+
codepoint <- peek codepointPtr
332+
chunkText <- unsafeSTToIO $ do
333+
arr <- A.unsafeFreeze dest
334+
return $! text arr 0 (fromIntegral n)
335+
lastPtr <- peek curPtrPtr
336+
let left = lastPtr `minusPtr` curPtr
337+
!undecoded = case state of
338+
UTF8_ACCEPT -> B.empty
339+
_ -> B.append undecoded0 (B.drop left bs)
340+
return $ Some chunkText undecoded
341+
(decodeChunk undecoded codepoint state)
342+
in loop ptr
339343
desc = "Data.Text.Internal.Encoding.streamDecodeUtf8With: Invalid UTF-8 stream"
340344

341345
-- | Decode a 'ByteString' containing UTF-8 encoded text that is known
@@ -436,12 +440,12 @@ encodeUtf8 (Text arr off len)
436440
newDest <- peek destPtr
437441
let utf8len = newDest `minusPtr` ptr
438442
if utf8len >= len `shiftR` 1
439-
then return (PS fp 0 utf8len)
443+
then return (mkBS fp utf8len)
440444
else do
441445
fp' <- mallocByteString utf8len
442446
withForeignPtr fp' $ \ptr' -> do
443447
memcpy ptr' ptr (fromIntegral utf8len)
444-
return (PS fp' 0 utf8len)
448+
return (mkBS fp' utf8len)
445449

446450
-- | Decode text from little endian UTF-16 encoding.
447451
decodeUtf16LEWith :: OnDecodeError -> ByteString -> Text
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
{-# LANGUAGE BangPatterns #-}
2+
{-# LANGUAGE CPP #-}
3+
{-# LANGUAGE MagicHash #-}
4+
module Data.Text.Internal.ByteStringCompat (mkBS, withBS) where
5+
6+
import Data.ByteString.Internal (ByteString (..))
7+
import Data.Word (Word8)
8+
import Foreign.ForeignPtr (ForeignPtr)
9+
10+
#if !MIN_VERSION_bytestring(0,11,0)
11+
#if MIN_VERSION_base(4,10,0)
12+
import GHC.ForeignPtr (plusForeignPtr)
13+
#else
14+
import GHC.ForeignPtr (ForeignPtr(ForeignPtr))
15+
import GHC.Types (Int (..))
16+
import GHC.Prim (plusAddr#)
17+
#endif
18+
#endif
19+
20+
mkBS :: ForeignPtr Word8 -> Int -> ByteString
21+
#if MIN_VERSION_bytestring(0,11,0)
22+
mkBS dfp n = BS dfp n
23+
#else
24+
mkBS dfp n = PS dfp 0 n
25+
#endif
26+
{-# INLINE mkBS #-}
27+
28+
withBS :: ByteString -> (ForeignPtr Word8 -> Int -> r) -> r
29+
#if MIN_VERSION_bytestring(0,11,0)
30+
withBS (BS !sfp !slen) kont = kont sfp slen
31+
#else
32+
withBS (PS !sfp !soff !slen) kont = kont (plusForeignPtr sfp soff) slen
33+
#endif
34+
{-# INLINE withBS #-}
35+
36+
#if !MIN_VERSION_bytestring(0,11,0)
37+
#if !MIN_VERSION_base(4,10,0)
38+
-- |Advances the given address by the given offset in bytes.
39+
--
40+
-- The new 'ForeignPtr' shares the finalizer of the original,
41+
-- equivalent from a finalization standpoint to just creating another
42+
-- reference to the original. That is, the finalizer will not be
43+
-- called before the new 'ForeignPtr' is unreachable, nor will it be
44+
-- called an additional time due to this call, and the finalizer will
45+
-- be called with the same address that it would have had this call
46+
-- not happened, *not* the new address.
47+
plusForeignPtr :: ForeignPtr a -> Int -> ForeignPtr b
48+
plusForeignPtr (ForeignPtr addr guts) (I# offset) = ForeignPtr (plusAddr# addr offset) guts
49+
{-# INLINE [0] plusForeignPtr #-}
50+
{-# RULES
51+
"ByteString plusForeignPtr/0" forall fp .
52+
plusForeignPtr fp 0 = fp
53+
#-}
54+
#endif
55+
#endif

src/Data/Text/Internal/Encoding/Fusion.hs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ import qualified Data.Text.Internal.Encoding.Utf8 as U8
5353
import qualified Data.Text.Internal.Encoding.Utf16 as U16
5454
import qualified Data.Text.Internal.Encoding.Utf32 as U32
5555
import Data.Text.Unsafe (unsafeDupablePerformIO)
56+
import Data.Text.Internal.ByteStringCompat
5657

5758
streamASCII :: ByteString -> Stream Char
5859
streamASCII bs = Stream next 0 (maxSize l)
@@ -185,7 +186,7 @@ unstream (Stream next s0 len) = unsafeDupablePerformIO $ do
185186
withForeignPtr fp' $ \p -> pokeByteOff p off x
186187
loop n' (off+1) s fp'
187188
{-# NOINLINE trimUp #-}
188-
trimUp fp _ off = return $! PS fp 0 off
189+
trimUp fp _ off = return $! mkBS fp off
189190
copy0 :: ForeignPtr Word8 -> Int -> Int -> IO (ForeignPtr Word8)
190191
copy0 !src !srcLen !destLen =
191192
#if defined(ASSERTS)

src/Data/Text/Internal/Lazy/Encoding/Fusion.hs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ import Data.ByteString.Internal (mallocByteString, memcpy)
5252
#if defined(ASSERTS)
5353
import Control.Exception (assert)
5454
#endif
55-
import qualified Data.ByteString.Internal as B
55+
import Data.Text.Internal.ByteStringCompat
5656

5757
data S = S0
5858
| S1 {-# UNPACK #-} !Word8
@@ -297,7 +297,7 @@ unstreamChunks chunkSize (Stream next s0 len0) = chunk s0 (upperBound 4 len0)
297297
fp' <- copy0 fp n n'
298298
withForeignPtr fp' $ \p -> pokeByteOff p off x
299299
loop n' (off+1) s fp'
300-
trimUp fp off = B.PS fp 0 off
300+
trimUp fp off = mkBS fp off
301301
copy0 :: ForeignPtr Word8 -> Int -> Int -> IO (ForeignPtr Word8)
302302
copy0 !src !srcLen !destLen =
303303
#if defined(ASSERTS)

tests/text-tests.cabal

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@ test-suite tests
121121
Data.Text.Internal.Builder.Functions
122122
Data.Text.Internal.Builder.Int.Digits
123123
Data.Text.Internal.Builder.RealFloat.Functions
124+
Data.Text.Internal.ByteStringCompat
124125
Data.Text.Internal.Encoding.Fusion
125126
Data.Text.Internal.Encoding.Fusion.Common
126127
Data.Text.Internal.Encoding.Utf16

text.cabal

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ library
120120
Data.Text.Internal.Builder.Functions
121121
Data.Text.Internal.Builder.Int.Digits
122122
Data.Text.Internal.Builder.RealFloat.Functions
123+
Data.Text.Internal.ByteStringCompat
123124
Data.Text.Internal.Encoding.Fusion
124125
Data.Text.Internal.Encoding.Fusion.Common
125126
Data.Text.Internal.Encoding.Utf16
@@ -168,7 +169,7 @@ library
168169
build-depends: bytestring >= 0.9 && < 0.10.4,
169170
bytestring-builder >= 0.10.4.0.2 && < 0.11
170171
else
171-
build-depends: bytestring >= 0.10.4 && < 0.11
172+
build-depends: bytestring >= 0.10.4 && < 0.12
172173

173174
ghc-options: -Wall -fwarn-tabs -funbox-strict-fields -O2
174175
if flag(developer)

0 commit comments

Comments
 (0)