Skip to content

Commit 50959fc

Browse files
committed
Further simplification
Compute the cstring literal's length in the caller, with the builder functions now taking boxed arguments Performance is the same, and the code is cleaner.
1 parent 4f0bdac commit 50959fc

File tree

7 files changed

+77
-64
lines changed

7 files changed

+77
-64
lines changed

Data/ByteString/Builder.hs

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -263,12 +263,14 @@ import qualified Data.ByteString.Builder.Prim as P
263263
import qualified Data.ByteString.Lazy.Internal as L
264264
import Data.ByteString.Builder.ASCII
265265
import Data.ByteString.Builder.RealFloat
266+
import Data.ByteString.Internal (byteCountLiteral)
266267

267268
import Data.String (IsString(..))
268269
import System.IO (Handle, IOMode(..), withBinaryFile)
269270
import Foreign
270271
import GHC.Base (unpackCString#, unpackCStringUtf8#,
271272
unpackFoldrCString#, build)
273+
import GHC.Ptr (Ptr(..))
272274

273275
-- | Execute a 'Builder' and return the generated chunks as a lazy 'L.ByteString'.
274276
-- The work is performed lazy, i.e., only when a chunk of the lazy 'L.ByteString'
@@ -440,18 +442,20 @@ char8 :: Char -> Builder
440442
char8 = P.primFixed P.char8
441443

442444
-- | Char8 encode a 'String'.
443-
{-# INLINE [1] string8 #-} -- phased to allow cstringLiteral rewrite
445+
{-# INLINE [1] string8 #-} -- phased to allow literal cstring rewrites
444446
string8 :: String -> Builder
445447
string8 = P.primMapListFixed P.char8
446448

447449
-- GHC desugars string literals with unpackCString# which the simplifier tends
448450
-- to promptly turn into build (unpackFoldrCString# s), so we match on both.
449451
{-# RULES
450452
"string8/unpackCString#" forall s.
451-
string8 (unpackCString# s) = cstringLiteral s
453+
string8 (unpackCString# s) =
454+
ascLiteralCopy (Ptr s) (byteCountLiteral s)
452455

453456
"string8/unpackFoldrCString#" forall s.
454-
string8 (build (unpackFoldrCString# s)) = cstringLiteral s
457+
string8 (build (unpackFoldrCString# s)) =
458+
ascLiteralCopy (Ptr s) (byteCountLiteral s)
455459
#-}
456460

457461
------------------------------------------------------------------------------
@@ -467,19 +471,22 @@ charUtf8 = P.primBounded P.charUtf8
467471
--
468472
-- Note that 'stringUtf8' performs no codepoint validation and consequently may
469473
-- emit invalid UTF-8 if asked (e.g. single surrogates).
470-
{-# INLINE [1] stringUtf8 #-} -- phased to allow cstringLiteral rewrite
474+
{-# INLINE [1] stringUtf8 #-} -- phased to allow literal cstring rewrites
471475
stringUtf8 :: String -> Builder
472476
stringUtf8 = P.primMapListBounded P.charUtf8
473477

474478
{-# RULES
475479
"stringUtf8/unpackCStringUtf8#" forall s.
476-
stringUtf8 (unpackCStringUtf8# s) = cstringUtf8Literal s
480+
stringUtf8 (unpackCStringUtf8# s) =
481+
modUtf8LitCopy (Ptr s) (byteCountLiteral s)
477482

478483
"stringUtf8/unpackCString#" forall s.
479-
stringUtf8 (unpackCString# s) = cstringLiteral s
484+
stringUtf8 (unpackCString# s) =
485+
ascLiteralCopy (Ptr s) (byteCountLiteral s)
480486

481487
"stringUtf8/unpackFoldrCString#" forall s.
482-
stringUtf8 (build (unpackFoldrCString# s)) = cstringLiteral s
488+
stringUtf8 (build (unpackFoldrCString# s)) =
489+
ascLiteralCopy (Ptr s) (byteCountLiteral s)
483490
#-}
484491

485492
instance IsString Builder where

Data/ByteString/Builder/Internal.hs

Lines changed: 26 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -84,10 +84,10 @@ module Data.ByteString.Builder.Internal (
8484
-- , sizedChunksInsert
8585

8686
, byteStringCopy
87+
, ascLiteralCopy
88+
, modUtf8LitCopy
8789
, byteStringInsert
8890
, byteStringThreshold
89-
, cstringLiteral
90-
, cstringUtf8Literal
9191

9292
, lazyByteStringCopy
9393
, lazyByteStringInsert
@@ -872,62 +872,52 @@ byteStringInsert =
872872
-- @0xC0 0x80@) null characters.
873873
--
874874
-- @since 0.11.5.0
875-
{-# INLINABLE cstringLiteral #-}
876-
cstringLiteral :: Addr# -> Builder
877-
cstringLiteral = \addr -> builder $ \k br -> do
878-
let ip = Ptr addr
879-
#if __GLASGOW_HASKELL__ >= 811
880-
ipe = Ptr (addr `plusAddr#` (cstringLength# addr))
881-
#else
882-
!ipe <- plusPtr ip . fromIntegral <$> S.c_strlen ip
883-
#endif
875+
{-# INLINABLE ascLiteralCopy #-}
876+
ascLiteralCopy :: Ptr Word8 -> Int -> Builder
877+
ascLiteralCopy = \ !ip !len -> builder $ \k br -> do
878+
let !ipe = ip `plusPtr` len
884879
wrappedBytesCopyStep (BufferRange ip ipe) k br
885880

886881
-- | Builder for raw 'Addr#' pointers to null-terminated primitive UTF-8
887882
-- encoded strings that may contain embedded overlong-encodings (as the
888883
-- two-byte sequence @0xC0 0x80@) of null characters.
889884
--
890885
-- @since 0.11.5.0
891-
{-# INLINABLE cstringUtf8Literal #-}
892-
cstringUtf8Literal :: Addr# -> Builder
893-
cstringUtf8Literal = \addr0 -> builder $ \k br -> do
894-
#if __GLASGOW_HASKELL__ >= 811
895-
let len = I# (cstringLength# addr0)
896-
#else
897-
len <- fromIntegral <$> S.c_strlen (Ptr addr0)
898-
#endif
899-
nullAt <- c_strstr (Ptr addr0) (Ptr "\xc0\x80"#)
900-
cstringUtf8_step (Ptr addr0) len nullAt k br
901-
902-
cstringUtf8_step :: Ptr Word8-> Int -> Ptr Word8 -> BuildStep r -> BuildStep r
903-
cstringUtf8_step !ip !len ((== nullPtr) -> True) k br =
886+
{-# INLINABLE modUtf8LitCopy #-}
887+
modUtf8LitCopy :: Ptr Word8 -> Int -> Builder
888+
modUtf8LitCopy = \ !ip !len -> builder $ \k br -> do
889+
nullAt <- c_strstr (castPtr ip) (Ptr "\xc0\x80"#)
890+
modUtf8_step ip len nullAt k br
891+
892+
modUtf8_step :: Ptr Word8 -> Int -> Ptr Word8 -> BuildStep r -> BuildStep r
893+
modUtf8_step !ip !len ((== nullPtr) -> True) k br =
904894
-- Contains no encoded nulls, use simple copy codepath
905895
wrappedBytesCopyStep (BufferRange ip ipe) k br
906896
where
907897
!ipe = ip `plusPtr` len
908-
cstringUtf8_step !ip !len !nullAt k (BufferRange op0 ope)
898+
modUtf8_step !ip !len !nullAt k (BufferRange op0 ope)
909899
-- Copy as much of the null-free portion of the string as fits into the
910900
-- available buffer space. If the string is long enough, we may have asked
911901
-- for less than its full length, filling the buffer with the rest will go
912902
-- into the next builder step.
913903
| avail > nullFree = do
914-
when (nullFree > 0) (S.memcpy op0 ip nullFree)
904+
when (nullFree > 0) (copyBytes op0 ip nullFree)
915905
pokeElemOff op0 nullFree 0
916-
let !op' = op0 `plusPtr` (nullFree + 1)
917-
nread = nullFree + 2
918-
!ip' = ip `plusPtr` nread
919-
len' = len - nread
906+
let used = nullFree + 2
907+
len' = len - used
908+
!ip' = ip `plusPtr` used
909+
!op' = op0 `plusPtr` (nullFree + 1)
920910
nullAt' <- c_strstr ip' (Ptr "\xc0\x80"#)
921-
cstringUtf8_step ip' len' nullAt' k (BufferRange op' ope)
911+
modUtf8_step ip' len' nullAt' k (BufferRange op' ope)
922912
| avail > 0 = do
923913
-- avail <= nullFree
924-
S.memcpy op0 ip avail
925-
let !op' = op0 `plusPtr` avail
914+
copyBytes op0 ip avail
915+
let len' = len - avail
926916
!ip' = ip `plusPtr` avail
927-
len' = len - avail
928-
return $ bufferFull 1 op' (cstringUtf8_step ip' len' nullAt k)
917+
!op' = op0 `plusPtr` avail
918+
return $ bufferFull 1 op' (modUtf8_step ip' len' nullAt k)
929919
| otherwise =
930-
return $ bufferFull 1 op0 (cstringUtf8_step ip len nullAt k)
920+
return $ bufferFull 1 op0 (modUtf8_step ip len nullAt k)
931921
where
932922
!avail = ope `minusPtr` op0
933923
!nullFree = nullAt `minusPtr` ip

Data/ByteString/Builder/Prim.hs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -668,19 +668,19 @@ primMapLazyByteStringBounded w =
668668
-- @0xC0 0x80@) null characters.
669669
--
670670
-- @since 0.11.0.0
671-
{-# DEPRECATED cstring "Use cstringLiteral instead" #-}
671+
{-# DEPRECATED cstring "Use ascLiteralCopy instead" #-}
672672
cstring :: Addr# -> Builder
673-
cstring = cstringLiteral
673+
cstring s = ascLiteralCopy (Ptr s) (S.byteCountLiteral s)
674674
{-# INLINE cstring #-}
675675

676676
-- | Builder for raw 'Addr#' pointers to null-terminated primitive UTF-8
677677
-- encoded strings that may contain embedded overlong-encodings (as the
678678
-- two-byte sequence @0xC0 0x80@) of null characters.
679679
--
680680
-- @since 0.11.0.0
681-
{-# DEPRECATED cstringUtf8 "Use cstringUtf8Literal instead" #-}
681+
{-# DEPRECATED cstringUtf8 "Use modUtf8LitCopy instead" #-}
682682
cstringUtf8 :: Addr# -> Builder
683-
cstringUtf8 = cstringUtf8Literal
683+
cstringUtf8 s = modUtf8LitCopy (Ptr s) (S.byteCountLiteral s)
684684
{-# INLINE cstringUtf8 #-}
685685

686686
------------------------------------------------------------------------------

Data/ByteString/Internal.hs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ module Data.ByteString.Internal (
3636
unpackBytes, unpackAppendBytesLazy, unpackAppendBytesStrict,
3737
unpackChars, unpackAppendCharsLazy, unpackAppendCharsStrict,
3838
unsafePackAddress, unsafePackLenAddress,
39-
unsafePackLiteral, unsafePackLenLiteral,
39+
unsafePackLiteral, unsafePackLenLiteral, byteCountLiteral,
4040

4141
-- * Low level imperative construction
4242
empty,

Data/ByteString/Internal/Type.hs

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ module Data.ByteString.Internal.Type (
3939
unpackBytes, unpackAppendBytesLazy, unpackAppendBytesStrict,
4040
unpackChars, unpackAppendCharsLazy, unpackAppendCharsStrict,
4141
unsafePackAddress, unsafePackLenAddress,
42-
unsafePackLiteral, unsafePackLenLiteral,
42+
unsafePackLiteral, unsafePackLenLiteral, byteCountLiteral,
4343

4444
-- * Low level imperative construction
4545
empty,
@@ -436,13 +436,22 @@ unsafePackLenAddress len addr# = do
436436
-- @since 0.11.1.0
437437
unsafePackLiteral :: Addr# -> ByteString
438438
unsafePackLiteral addr# =
439+
unsafePackLenLiteral (byteCountLiteral addr#) addr#
440+
{-# INLINE unsafePackLiteral #-}
441+
442+
-- | Byte count of null-terminated primitive literal string excluding the
443+
-- terminating null byte.
444+
byteCountLiteral :: Addr# -> Int
445+
byteCountLiteral addr# =
439446
#if __GLASGOW_HASKELL__ >= 811
440-
unsafePackLenLiteral (I# (cstringLength# addr#)) addr#
447+
I# (cstringLength# addr#)
441448
#else
442-
let len = accursedUnutterablePerformIO (c_strlen (Ptr addr#))
443-
in unsafePackLenLiteral (fromIntegral len) addr#
449+
fromIntegral (pure_strlen (Ptr addr#))
450+
451+
foreign import ccall unsafe "string.h strlen" pure_strlen
452+
:: CString -> CSize
444453
#endif
445-
{-# INLINE unsafePackLiteral #-}
454+
{-# INLINE byteCountLiteral #-}
446455

447456

448457
-- | See 'unsafePackLiteral'. This function is similar,

bench/BenchAll.hs

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import qualified Data.ByteString as S
2525
import qualified Data.ByteString.Char8 as S8
2626
import qualified Data.ByteString.Lazy as L
2727
import qualified Data.ByteString.Lazy.Char8 as L8
28+
import Data.ByteString.Internal (byteCountLiteral)
2829

2930
import Data.ByteString.Builder
3031
import Data.ByteString.Builder.Extra (byteStringCopy,
@@ -250,12 +251,13 @@ largeTraversalInput = S.concat (replicate 10 byteStringData)
250251
smallTraversalInput :: S.ByteString
251252
smallTraversalInput = S8.pack "The quick brown fox"
252253

253-
asciiLit, utf8Lit :: Ptr Word8
254-
asciiLit = Ptr "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"#
255-
utf8Lit = Ptr "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\xc0\x80xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"#
254+
ascBuf, utfBuf :: Ptr Word8
255+
ascBuf = Ptr "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"#
256+
utfBuf = Ptr "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\xc0\x80xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"#
256257

257-
addrOf :: Ptr Word8 -> Addr#
258-
addrOf (Ptr addr) = addr
258+
asclit, utflit :: Ptr Word8 -> Builder
259+
asclit str@(Ptr addr) = BI.ascLiteralCopy str (byteCountLiteral addr)
260+
utflit str@(Ptr addr) = BI.modUtf8LitCopy str (byteCountLiteral addr)
259261

260262
main :: IO ()
261263
main = do
@@ -266,11 +268,11 @@ main = do
266268
, benchB' "ensureFree 8" () (const (ensureFree 8))
267269
, benchB' "intHost 1" 1 intHost
268270
, benchB' "UTF-8 String (naive)" "hello world\0" fromString
269-
, benchB' "UTF-8 String" () $ \() -> BI.cstringUtf8Literal "hello world\xc0\x80"#
271+
, benchB' "UTF-8 String" () $ \() -> utflit (Ptr "hello world\xc0\x80"#)
270272
, benchB' "String (naive)" "hello world!" fromString
271-
, benchB' "String" () $ \() -> BI.cstringLiteral "hello world!"#
272-
, benchB' "AsciiLit" () $ \() -> BI.cstringLiteral (addrOf asciiLit)
273-
, benchB' "Utf8Lit" () $ \() -> BI.cstringUtf8Literal (addrOf utf8Lit)
273+
, benchB' "String" () $ \() -> asclit (Ptr "hello world!"#)
274+
, benchB' "AsciiLit" () $ \() -> asclit ascBuf
275+
, benchB' "Utf8Lit" () $ \() -> utflit utfBuf
274276
]
275277

276278
, bgroup "Encoding wrappers"

tests/builder/Data/ByteString/Builder/Tests.hs

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ import Control.Exception (evaluate)
5050
import System.IO (openTempFile, hPutStr, hClose, hSetBinaryMode, hSetEncoding, utf8, hSetNewlineMode, noNewlineTranslation)
5151
import Foreign (ForeignPtr, withForeignPtr, castPtr)
5252
import Foreign.C.String (withCString)
53+
import GHC.Ptr (Ptr(..))
5354
import Numeric (showFFloat)
5455
import System.Posix.Internals (c_unlink)
5556

@@ -994,14 +995,18 @@ testsUtf8 =
994995
testCString :: [TestTree]
995996
testCString =
996997
[ testProperty "cstring" $
997-
toLazyByteString (BI.cstringLiteral "hello world!"#) ==
998+
toLazyByteString (asclit (Ptr "hello world!"#)) ==
998999
LC.pack "hello" `L.append` L.singleton 0x20
9991000
`L.append` LC.pack "world!"
10001001
, testProperty "cstringUtf8" $
1001-
toLazyByteString (BI.cstringUtf8Literal "hello\xc0\x80\xc0\x80world\xc0\x80!"#) ==
1002+
toLazyByteString (utflit (Ptr "hello\xc0\x80\xc0\x80world\xc0\x80!"#)) ==
10021003
LC.pack "hello" `L.append` L.singleton 0x00
10031004
`L.append` L.singleton 0x00
10041005
`L.append` LC.pack "world"
10051006
`L.append` L.singleton 0x00
10061007
`L.append` LC.singleton '!'
10071008
]
1009+
1010+
asclit, utflit :: Ptr Word8 -> Builder
1011+
asclit str@(Ptr addr) = BI.ascLiteralCopy str (S.byteCountLiteral addr)
1012+
utflit str@(Ptr addr) = BI.modUtf8LitCopy str (S.byteCountLiteral addr)

0 commit comments

Comments
 (0)