Skip to content

Commit 9086b60

Browse files
committed
Further tuneup of cstring{,Utf8}
Moved to Data.ByteString.Builder.Internal, as these no longer have anything to do with 'BoundedPrim', and can benefit from supporting internal code in their new home.
1 parent 266d6da commit 9086b60

File tree

7 files changed

+102
-115
lines changed

7 files changed

+102
-115
lines changed

Data/ByteString/Builder/Internal.hs

Lines changed: 77 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{-# LANGUAGE ScopedTypeVariables, CPP, BangPatterns, RankNTypes, TupleSections #-}
2-
{-# LANGUAGE Unsafe #-}
2+
{-# LANGUAGE MagicHash, ViewPatterns, Unsafe #-}
33
{-# OPTIONS_HADDOCK not-home #-}
44
-- | Copyright : (c) 2010 - 2011 Simon Meier
55
-- License : BSD3-style (see LICENSE)
@@ -86,6 +86,8 @@ module Data.ByteString.Builder.Internal (
8686
, byteStringCopy
8787
, byteStringInsert
8888
, byteStringThreshold
89+
, cstring
90+
, cstringUtf8
8991

9092
, lazyByteStringCopy
9193
, lazyByteStringInsert
@@ -127,6 +129,7 @@ module Data.ByteString.Builder.Internal (
127129
) where
128130

129131
import Control.Arrow (second)
132+
import Control.Monad (when)
130133

131134
#if !(MIN_VERSION_base(4,11,0))
132135
import Data.Semigroup (Semigroup((<>)))
@@ -140,10 +143,12 @@ import qualified Data.ByteString.Short.Internal as Sh
140143
import qualified GHC.IO.Buffer as IO (Buffer(..), newByteBuffer)
141144
import GHC.IO.Handle.Internals (wantWritableHandle, flushWriteBuffer)
142145
import GHC.IO.Handle.Types (Handle__, haByteBuffer, haBufferMode)
146+
import GHC.Exts
143147
import System.IO (hFlush, BufferMode(..), Handle)
144148
import Data.IORef
145149

146150
import Foreign
151+
import Foreign.C.String (CString)
147152
import Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr)
148153
import System.IO.Unsafe (unsafeDupablePerformIO)
149154

@@ -857,6 +862,77 @@ byteStringInsert :: S.ByteString -> Builder
857862
byteStringInsert =
858863
\bs -> builder $ \k (BufferRange op _) -> return $ insertChunk op bs k
859864

865+
866+
------------------------------------------------------------------------------
867+
-- Raw CString encoding
868+
------------------------------------------------------------------------------
869+
870+
-- | A null-terminated ASCII encoded 'Foreign.C.String.CString'.
871+
-- Null characters are not representable.
872+
--
873+
-- @since 0.11.5.0
874+
{-# INLINABLE cstring #-}
875+
cstring :: Addr# -> Builder
876+
cstring = \addr -> builder $ \k br -> do
877+
let ip = Ptr addr
878+
#if __GLASGOW_HASKELL__ >= 811
879+
ipe = Ptr (addr `plusAddr#` (cstringLength# addr))
880+
#else
881+
!ipe <- plusPtr ip . fromIntegral <$> S.c_strlen ip
882+
#endif
883+
wrappedBytesCopyStep (BufferRange ip ipe) k br
884+
885+
-- | A null-terminated UTF-8 encoded 'Foreign.C.String.CString'.
886+
-- Null characters can be encoded as @0xc0 0x80@.
887+
--
888+
-- @since 0.11.5.0
889+
cstringUtf8 :: Addr# -> Builder
890+
cstringUtf8 = \addr0 -> builder $ \k br -> do
891+
#if __GLASGOW_HASKELL__ >= 811
892+
let len = cstringLength# addr0
893+
#else
894+
(I# len) <- fromIntegral <$> c_strlen (Ptr addr0)
895+
#endif
896+
nullAt <- c_strstr (Ptr addr0) (Ptr "\xc0\x80"#)
897+
cstringUtf8_step addr0 len nullAt k br
898+
{-# INLINABLE cstringUtf8 #-}
899+
900+
cstringUtf8_step :: Addr# -> Int# -> Ptr Word8 -> BuildStep r -> BuildStep r
901+
cstringUtf8_step addr len ((== nullPtr) -> True) k br =
902+
-- Contains no encoded nulls, use simple copy codepath
903+
wrappedBytesCopyStep (BufferRange ip ipe) k br
904+
where
905+
ip = Ptr addr
906+
ipe = Ptr (addr `plusAddr#` len)
907+
cstringUtf8_step addr len !nullAt k (BufferRange op0 ope)
908+
-- Copy as much of the null-free portion of the string as fits into the
909+
-- available buffer space. If the string is long enough, we may have asked
910+
-- for less than its full length, filling the buffer with the rest will go
911+
-- into the next builder step.
912+
| avail > nullFree = do
913+
when (nullFree > 0) (S.memcpy op0 (Ptr addr) nullFree)
914+
pokeElemOff op0 nullFree 0
915+
let !op' = op0 `plusPtr` (nullFree + 1)
916+
nread# = nullFree# +# 2#
917+
addr' = addr `plusAddr#` nread#
918+
len' = len -# nread#
919+
nullAt' <- c_strstr (Ptr addr') (Ptr "\xc0\x80"#)
920+
cstringUtf8_step addr' len' nullAt' k (BufferRange op' ope)
921+
| otherwise = do
922+
let !copy@(I# copy#) = min avail nullFree
923+
when (copy > 0) (S.memcpy op0 (Ptr addr) copy)
924+
let !op' = op0 `plusPtr` copy
925+
addr' = addr `plusAddr#` copy#
926+
len' = len -# copy#
927+
return $ bufferFull 1 op' (cstringUtf8_step addr' len' nullAt k)
928+
where
929+
!avail = ope `minusPtr` op0
930+
!nullFree@(I# nullFree#) = nullAt `minusPtr` (Ptr addr)
931+
932+
foreign import ccall unsafe "string.h strstr" c_strstr
933+
:: CString -> CString -> IO (Ptr Word8)
934+
935+
860936
-- Short bytestrings
861937
------------------------------------------------------------------------------
862938

Data/ByteString/Builder/Prim.hs

Lines changed: 4 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
{-# LANGUAGE BangPatterns, CPP, ScopedTypeVariables #-}
1+
{-# LANGUAGE BangPatterns, ScopedTypeVariables #-}
22
{-# LANGUAGE MagicHash, UnboxedTuples, PatternGuards #-}
33
{-# OPTIONS_GHC -fno-warn-unused-imports #-}
44
{-# LANGUAGE Trustworthy #-}
@@ -433,8 +433,8 @@ module Data.ByteString.Builder.Prim (
433433
-- a decimal number with UTF-8 encoded characters.
434434
, charUtf8
435435

436-
, cstring
437-
, cstringUtf8
436+
, cstring -- Backwards-compatibility re-exports from Internal.hs
437+
, cstringUtf8 -- these no longer make use of the BoundPrim API.
438438

439439
{-
440440
-- * Testing support
@@ -468,6 +468,7 @@ import Data.ByteString.Builder.Prim.ASCII
468468

469469
import Foreign
470470
import Foreign.C.Types
471+
import Foreign.C.String (CString)
471472
import Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr)
472473
import GHC.Int (Int (..))
473474
import GHC.Word (Word8 (..))
@@ -664,97 +665,6 @@ primMapLazyByteStringBounded w =
664665
L.foldrChunks (\x b -> primMapByteStringBounded w x `mappend` b) mempty
665666

666667

667-
------------------------------------------------------------------------------
668-
-- Raw CString encoding
669-
------------------------------------------------------------------------------
670-
671-
-- | A null-terminated ASCII encoded 'Foreign.C.String.CString'.
672-
-- Null characters are not representable.
673-
--
674-
-- @since 0.11.0.0
675-
cstring :: Addr# -> Builder
676-
cstring = \addr0 -> builder $ \k br -> do
677-
#if __GLASGOW_HASKELL__ >= 811
678-
let len = cstringLength# addr0
679-
#else
680-
(I# len) <- fromIntegral <$> S.c_strlen (Ptr addr0)
681-
#endif
682-
cstring_step addr0 len k br
683-
{-# INLINE cstring #-}
684-
685-
cstring_step :: Addr# -> Int# -> BuildStep r -> BuildStep r
686-
cstring_step !addr !len !k br@(BufferRange op0 ope)
687-
-- String is empty, process the continuation
688-
| (I# len) == 0 = k br
689-
-- Buffer is full, allocate some more... We ask for just one more
690-
-- byte, but the builder allocation strategy will in practice give
691-
-- us more space, which we'll consume in a single step.
692-
| op0 == ope =
693-
return $ bufferFull 1 op0 (cstring_step addr len k)
694-
-- Copy as much of the string as fits into the available buffer space.
695-
-- If the string is long enough, we may have asked for less than its
696-
-- full length, filling the buffer with the rest will go into the next
697-
-- builder step.
698-
| otherwise = do
699-
let !avail@(I# avail#) = min (I# len) (ope `minusPtr` op0)
700-
br' = BufferRange (op0 `plusPtr` avail) ope
701-
addr' = addr `plusAddr#` avail#
702-
len' = len -# avail#
703-
S.memcpy op0 (Ptr addr) avail
704-
cstring_step addr' len' k br'
705-
706-
-- | A null-terminated UTF-8 encoded 'Foreign.C.String.CString'.
707-
-- Null characters can be encoded as @0xc0 0x80@.
708-
--
709-
-- @since 0.11.0.0
710-
cstringUtf8 :: Addr# -> Builder
711-
cstringUtf8 = \addr0 -> builder $ \k br -> do
712-
#if __GLASGOW_HASKELL__ >= 811
713-
let len = cstringLength# addr0
714-
#else
715-
(I# len) <- fromIntegral <$> S.c_strlen (Ptr addr0)
716-
#endif
717-
nullAt <- S.c_strstr (Ptr addr0) (Ptr "\xc0\x80"#)
718-
cstringUtf8_step addr0 len nullAt k br
719-
{-# INLINE cstringUtf8 #-}
720-
721-
cstringUtf8_step :: Addr# -> Int# -> Ptr Word8 -> BuildStep r -> BuildStep r
722-
cstringUtf8_step !addr !len !nullAt !k br@(BufferRange op0@(Ptr op0#) ope)
723-
-- String is empty, process the continuation
724-
| (I# len) == 0 = k br
725-
-- Contains no encoded nulls, use simpler 'cstring' code
726-
| nullPtr == nullAt =
727-
cstring_step addr len k br
728-
-- Buffer is full, allocate some more... We ask for just one more
729-
-- byte, but the builder allocation strategy will in practice give
730-
-- us more space, which we'll consume in a single step.
731-
| op0 == ope =
732-
return $ bufferFull 1 op0 (cstringUtf8_step addr len nullAt k)
733-
-- We're at the encoded null-byte, append a '\0' to the buffer and
734-
-- continue with the rest of the input string, after locating the
735-
-- next encoded null-byte, if any.
736-
| (Ptr addr) == nullAt = do
737-
let !(W8# nullByte#) = 0
738-
IO $ \s -> case writeWord8OffAddr# op0# 0# nullByte# s of
739-
s' -> (# s', () #)
740-
let br' = BufferRange (op0 `plusPtr` 1) ope
741-
addr' = addr `plusAddr#` 2#
742-
len' = len -# 2#
743-
nullAt' <- S.c_strstr (Ptr addr') (Ptr "\xc0\x80"#)
744-
cstringUtf8_step addr' len' nullAt' k br'
745-
-- Copy as much of the null-free portion of the string as fits into the
746-
-- available buffer space. If the string is long enough, we may have asked
747-
-- for less than its full length, filling the buffer with the rest will go
748-
-- into the next builder step.
749-
| otherwise = do
750-
let !nullFree = nullAt `minusPtr` (Ptr addr)
751-
!avail@(I# avail#) = min nullFree (ope `minusPtr` op0)
752-
br' = BufferRange (op0 `plusPtr` avail) ope
753-
addr' = addr `plusAddr#` avail#
754-
len' = len -# avail#
755-
S.memcpy op0 (Ptr addr) avail
756-
cstringUtf8_step addr' len' nullAt k br'
757-
758668
------------------------------------------------------------------------------
759669
-- Char8 encoding
760670
------------------------------------------------------------------------------

Data/ByteString/Internal.hs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,6 @@ module Data.ByteString.Internal (
6565

6666
-- * Standard C Functions
6767
c_strlen,
68-
c_strstr,
6968
c_free_finalizer,
7069

7170
memchr,

Data/ByteString/Internal/Type.hs

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,6 @@ module Data.ByteString.Internal.Type (
8282

8383
-- * Standard C Functions
8484
c_strlen,
85-
c_strstr,
8685
c_free_finalizer,
8786

8887
memchr,
@@ -1002,9 +1001,6 @@ accursedUnutterablePerformIO (IO m) = case m realWorld# of (# _, r #) -> r
10021001
foreign import ccall unsafe "string.h strlen" c_strlen
10031002
:: CString -> IO CSize
10041003

1005-
foreign import ccall unsafe "string.h strstr" c_strstr
1006-
:: CString -> CString -> IO (Ptr Word8)
1007-
10081004
foreign import ccall unsafe "static stdlib.h &free" c_free_finalizer
10091005
:: FunPtr (Ptr Word8 -> IO ())
10101006

bench/BenchAll.hs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,8 @@ main = do
259259
, benchB' "UTF-8 String" () $ \() -> P.cstringUtf8 "hello world\0"#
260260
, benchB' "String (naive)" "hello world!" fromString
261261
, benchB' "String" () $ \() -> P.cstring "hello world!"#
262+
, benchB' "AsciiLit64" () $ \() -> P.cstring "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"#
263+
, benchB' "Utf8Lit64" () $ \() -> P.cstringUtf8 "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\xc0\x80xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"#
262264
]
263265

264266
, bgroup "Encoding wrappers"

tests/builder/Data/ByteString/Builder/Prim/Tests.hs

Lines changed: 1 addition & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -24,20 +24,7 @@ import Test.Tasty.QuickCheck
2424

2525
tests :: [TestTree]
2626
tests = concat [ testsBinary, testsASCII, testsChar8, testsUtf8
27-
, testsCombinatorsB, [testCString, testCStringUtf8] ]
28-
29-
testCString :: TestTree
30-
testCString = testProperty "cstring" $
31-
toLazyByteString (BP.cstring "hello world!"#) ==
32-
LC.pack "hello" `L.append` L.singleton 0x20 `L.append` LC.pack "world!"
33-
34-
testCStringUtf8 :: TestTree
35-
testCStringUtf8 = testProperty "cstringUtf8" $
36-
toLazyByteString (BP.cstringUtf8 "hello\xc0\x80world\xc0\x80!"#) ==
37-
LC.pack "hello" `L.append` L.singleton 0x00
38-
`L.append` LC.pack "world"
39-
`L.append` L.singleton 0x00
40-
`L.append` LC.singleton '!'
27+
, testsCombinatorsB ]
4128

4229
------------------------------------------------------------------------------
4330
-- Binary

tests/builder/Data/ByteString/Builder/Tests.hs

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ import Data.Word
3636
import qualified Data.ByteString as S
3737
import qualified Data.ByteString.Internal as S
3838
import qualified Data.ByteString.Lazy as L
39+
import qualified Data.ByteString.Lazy.Char8 as LC
3940
import qualified Data.ByteString.Short as Sh
4041

4142
import Data.ByteString.Builder
@@ -73,7 +74,8 @@ tests =
7374
testsASCII ++
7475
testsFloating ++
7576
testsChar8 ++
76-
testsUtf8
77+
testsUtf8 ++
78+
testCString
7779

7880

7981
------------------------------------------------------------------------------
@@ -988,3 +990,18 @@ testsUtf8 =
988990
[ testBuilderConstr "charUtf8" charUtf8_list charUtf8
989991
, testBuilderConstr "stringUtf8" (foldMap charUtf8_list) stringUtf8
990992
]
993+
994+
testCString :: [TestTree]
995+
testCString =
996+
[ testProperty "cstring" $
997+
toLazyByteString (BI.cstring "hello world!"#) ==
998+
LC.pack "hello" `L.append` L.singleton 0x20
999+
`L.append` LC.pack "world!"
1000+
, testProperty "cstringUtf8" $
1001+
toLazyByteString (BI.cstringUtf8 "hello\xc0\x80\xc0\x80world\xc0\x80!"#) ==
1002+
LC.pack "hello" `L.append` L.singleton 0x00
1003+
`L.append` L.singleton 0x00
1004+
`L.append` LC.pack "world"
1005+
`L.append` L.singleton 0x00
1006+
`L.append` LC.singleton '!'
1007+
]

0 commit comments

Comments
 (0)