Skip to content

Commit 395092a

Browse files
committed
Avoid per-byte loop in cstring{,Utf8} builders
Copy chunks of the input to the output buffer with, up to the shorter of the available buffer space and the "null-free" portion of the remaining string. Actually "null-free" here means not containing any denormalised two-byte encodings starting with 0xC0 (so possibly also other ASCII bytes if the UTF-8 encoding is oddball). This substantially improves performance, with just one "15%" increase that looks like a spurious measurement error (perhaps code layout difference artefact). UTF-8 String (12B): OK 16.7 ns ± 1.3 ns, 60% less than baseline UTF-8 String (64B, one null): OK 22.6 ns ± 1.3 ns, 87% less than baseline UTF-8 String (64B, one null, no shared work): OK 30.1 ns ± 2.6 ns, 83% less than baseline UTF-8 String (64B, half nulls): OK 92.6 ns ± 5.3 ns, 49% less than baseline UTF-8 String (64B, all nulls): OK 76.3 ns ± 4.5 ns, 57% less than baseline UTF-8 String (64B, all nulls, no shared work): OK 82.3 ns ± 5.6 ns, 54% less than baseline ASCII String (12B): OK 6.50 ns ± 326 ps, 76% less than baseline ASCII String (64B): OK 8.03 ns ± 334 ps, 94% less than baseline AsciiLit: OK 8.02 ns ± 648 ps, 94% less than baseline Utf8Lit: OK 21.8 ns ± 1.3 ns, 88% less than baseline strLit: OK 8.90 ns ± 788 ps, 94% less than baseline stringUtf8: OK 22.4 ns ± 1.3 ns, 87% less than baseline strLitInline: OK 8.26 ns ± 676 ps, 94% less than baseline utf8LitInline: OK 23.2 ns ± 1.3 ns, 87% less than baseline foldMap byteStringInsert (10000): OK 46.0 μs ± 4.0 μs, 15% less than baseline --> lazyByteStringHex (10000): OK --> 4.74 μs ± 337 ns, 15% more than baseline foldMap integerDec (small) (10000): OK 205 μs ± 12 μs, 9% less than baseline char8 (10000): OK 2.58 μs ± 234 ns, 30% less than baseline foldMap (left-assoc) (10000): OK 73.2 μs ± 2.9 μs, 54% less than baseline foldMap (right-assoc) (10000): OK 43.0 μs ± 4.2 μs, 65% less than baseline foldMap [manually fused, left-assoc] (10000): OK 81.4 μs ± 5.3 μs, 48% less than baseline foldMap [manually fused, right-assoc] (10000): OK 47.3 μs ± 785 ns, 61% less than baseline
1 parent 72b1552 commit 395092a

File tree

5 files changed

+151
-53
lines changed

5 files changed

+151
-53
lines changed

Data/ByteString/Builder/Internal.hs

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
{-# LANGUAGE Unsafe #-}
22
{-# LANGUAGE TypeFamilies #-}
3+
{-# LANGUAGE TypeApplications #-}
34
{-# LANGUAGE NoMonoLocalBinds #-}
5+
{-# LANGUAGE ViewPatterns #-}
46

57
{-# OPTIONS_HADDOCK not-home #-}
68

@@ -87,6 +89,8 @@ module Data.ByteString.Builder.Internal (
8789
-- , sizedChunksInsert
8890

8991
, byteStringCopy
92+
, asciiLiteralCopy
93+
, modUtf8LitCopy
9094
, byteStringInsert
9195
, byteStringThreshold
9296

@@ -816,6 +820,7 @@ ensureFree :: Int -> Builder
816820
ensureFree minFree =
817821
builder step
818822
where
823+
step :: forall r. BuildStep r -> BuildStep r
819824
step k br@(BufferRange op ope)
820825
| ope `minusPtr` op < minFree = return $ bufferFull minFree op k
821826
| otherwise = k br
@@ -839,6 +844,25 @@ wrappedBytesCopyStep bs0 k =
839844
where
840845
outRemaining = ope `minusPtr` op
841846

847+
-- | Copy the bytes from a 'BufferRange' into the output stream.
848+
wrappedBufferRangeCopyStep :: BufferRange -- ^ Input 'BufferRange'.
849+
-> BuildStep a -> BuildStep a
850+
wrappedBufferRangeCopyStep (BufferRange ip0 ipe) k =
851+
go ip0
852+
where
853+
go !ip (BufferRange op ope)
854+
| inpRemaining <= outRemaining = do
855+
copyBytes op ip inpRemaining
856+
let !br' = BufferRange (op `plusPtr` inpRemaining) ope
857+
k br'
858+
| otherwise = do
859+
copyBytes op ip outRemaining
860+
let !ip' = ip `plusPtr` outRemaining
861+
return $ bufferFull 1 ope (go ip')
862+
where
863+
outRemaining = ope `minusPtr` op
864+
inpRemaining = ipe `minusPtr` ip
865+
842866

843867
-- Strict ByteStrings
844868
------------------------------------------------------------------------------
@@ -858,6 +882,7 @@ byteStringThreshold :: Int -> S.StrictByteString -> Builder
858882
byteStringThreshold maxCopySize =
859883
\bs -> builder $ step bs
860884
where
885+
step :: forall r. S.ByteString -> BuildStep r -> BuildStep r
861886
step bs@(S.BS _ len) k br@(BufferRange !op _)
862887
| len <= maxCopySize = byteStringCopyStep bs k br
863888
| otherwise = return $ insertChunk op bs k
@@ -949,6 +974,88 @@ byteStringInsert :: S.StrictByteString -> Builder
949974
byteStringInsert =
950975
\bs -> builder $ \k (BufferRange op _) -> return $ insertChunk op bs k
951976

977+
978+
------------------------------------------------------------------------------
979+
-- Raw CString encoding
980+
------------------------------------------------------------------------------
981+
982+
-- | Builder for raw pointers to static data of known length that will never be
983+
-- moved or freed. (This is used with the static buffers GHC uses to implement
984+
-- ASCII string literals that do not contain null characters.)
985+
--
986+
-- @since 0.13.0.0
987+
{-# INLINABLE asciiLiteralCopy #-}
988+
asciiLiteralCopy :: Ptr Word8 -> Int -> Builder
989+
asciiLiteralCopy = \ !ip !len -> builder $ \k br@(BufferRange op ope) ->
990+
if len <= ope `minusPtr` op
991+
then copyBytes op ip len >> k (BufferRange (op `plusPtr` len) ope)
992+
else wrappedBufferRangeCopyStep (BufferRange ip (ip `plusPtr` len)) k br
993+
994+
-- | Builder for pointers to /null-terminated/ primitive UTF-8 encoded strings
995+
-- that may contain embedded overlong two-byte encodings of the NUL character
996+
-- as @0xC0 0x80@. Other deviations from strict UTF-8 are tolerated, but the
997+
-- result is not well defined.
998+
--
999+
-- @since 0.13.0.0
1000+
{-# INLINABLE modUtf8LitCopy #-}
1001+
modUtf8LitCopy :: Ptr Word8 -> Int -> Builder
1002+
modUtf8LitCopy !ip !len
1003+
| len > 0 = builder (modUtf8_step ip len)
1004+
| otherwise = builder id
1005+
1006+
-- | Copy a /non-empty/ UTF-8 input possibly containing denormalised 2-octet
1007+
-- sequences. While only the NUL byte should ever encoded that way (as @0xC0
1008+
-- 80@), this handles other denormalised @0xC0 0x??@ sequences by keeping the
1009+
-- bottom 6 bits of the second byte. If the input is non-UTF8 garbage, the the
1010+
-- result may not be what the user expected.
1011+
--
1012+
modUtf8_step :: Ptr Word8 -> Int -> BuildStep r -> BuildStep r
1013+
modUtf8_step !ip !len k (BufferRange op ope)
1014+
| op == ope = return $ bufferFull 1 op (modUtf8_step ip len k)
1015+
| otherwise = do
1016+
let !avail = ope `minusPtr` op
1017+
!usable = avail `min` len
1018+
-- null-termination makes it possible to read one more byte than the
1019+
-- nominal input length, with any unexpected 0xC000 ending interpreted
1020+
-- as a NUL. More typically, this simplifies hanlding of inputs where
1021+
-- 0xC0 0x80 might otherwise be split across the "usable" input window.
1022+
!ch <- peekElemOff ip (usable - 1)
1023+
let !use | ch /= 0xC0 = usable
1024+
| otherwise = usable + 1
1025+
!n <- utf8_copyBytes (ip `plusPtr` use) ip op
1026+
let !op' = op `plusPtr` n
1027+
!len' = len - use
1028+
ip' = ip `plusPtr` use
1029+
if | len' <= 0 -> k (BufferRange op' ope)
1030+
| op' < ope -> modUtf8_step ip' len' k (BufferRange op' ope)
1031+
| otherwise -> return $ bufferFull 1 op' (modUtf8_step ip' len' k)
1032+
1033+
-- | Consume the supplied input returning the number of bytes written
1034+
utf8_copyBytes :: Ptr Word8 -> Ptr Word8 -> Ptr Word8 -> IO Int
1035+
utf8_copyBytes !ipe = \ ip op -> go 0 ip op
1036+
where
1037+
go :: Int -> Ptr Word8 -> Ptr Word8 -> IO Int
1038+
go !n !ip@((< ipe) -> True) !op = do
1039+
!ch <- peek ip
1040+
let !ip' = ip `plusPtr` 1
1041+
!op' = op `plusPtr` 1
1042+
if | ch /= 0xC0 -> do
1043+
poke op ch
1044+
let !cnt = ipe `minusPtr` ip'
1045+
!runend <- S.memchr ip' 0xC0 (fromIntegral @Int cnt)
1046+
let !runlen | runend == nullPtr = cnt
1047+
| otherwise = runend `minusPtr` ip'
1048+
if (runlen == 0)
1049+
then go (n + 1) ip' op'
1050+
else do
1051+
copyBytes op' ip' runlen
1052+
go (n + 1 + runlen) (ip' `plusPtr` runlen) (op' `plusPtr` runlen)
1053+
| otherwise -> do
1054+
!ch' <- peek ip'
1055+
poke op (ch' .&. 0x3f)
1056+
go (n + 1) (ip' `plusPtr` 1) op'
1057+
go !n _ _ = pure n
1058+
9521059
-- Short bytestrings
9531060
------------------------------------------------------------------------------
9541061

Data/ByteString/Builder/Prim.hs

Lines changed: 10 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -453,6 +453,7 @@ import Data.ByteString.Builder.Internal
453453

454454
import qualified Data.ByteString as S
455455
import qualified Data.ByteString.Internal as S
456+
import qualified Data.ByteString.Internal.Type as S
456457
import qualified Data.ByteString.Lazy.Internal as L
457458

458459
import Data.Char (ord)
@@ -464,9 +465,7 @@ import Data.ByteString.Builder.Prim.ASCII
464465

465466
import Foreign
466467
import Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr)
467-
import GHC.Word (Word8 (..))
468468
import GHC.Exts
469-
import GHC.IO
470469

471470
------------------------------------------------------------------------------
472471
-- Creating Builders from bounded primitives
@@ -658,59 +657,22 @@ primMapLazyByteStringBounded w =
658657
L.foldrChunks (\x b -> primMapByteStringBounded w x `mappend` b) mempty
659658

660659

661-
------------------------------------------------------------------------------
662-
-- Raw CString encoding
663-
------------------------------------------------------------------------------
664-
665-
-- | A null-terminated ASCII encoded 'Foreign.C.String.CString'.
666-
-- Null characters are not representable.
660+
-- | Builder for raw 'Addr#' pointers to null-terminated primitive ASCII
661+
-- strings that are free of embedded null characters.
667662
--
668663
-- @since 0.11.0.0
669664
cstring :: Addr# -> Builder
670-
cstring =
671-
\addr0 -> builder $ step addr0
672-
where
673-
step :: Addr# -> BuildStep r -> BuildStep r
674-
step !addr !k br@(BufferRange op0@(Ptr op0#) ope)
675-
| W8# ch == 0 = k br
676-
| op0 == ope =
677-
return $ bufferFull 1 op0 (step addr k)
678-
| otherwise = do
679-
IO $ \s -> case writeWord8OffAddr# op0# 0# ch s of
680-
s' -> (# s', () #)
681-
let br' = BufferRange (op0 `plusPtr` 1) ope
682-
step (addr `plusAddr#` 1#) k br'
683-
where
684-
!ch = indexWord8OffAddr# addr 0#
665+
cstring s = asciiLiteralCopy (Ptr s) (S.byteCountLiteral s)
666+
{-# INLINE cstring #-}
685667

686-
-- | A null-terminated UTF-8 encoded 'Foreign.C.String.CString'.
687-
-- Null characters can be encoded as @0xc0 0x80@.
668+
-- | Builder for raw 'Addr#' pointers to null-terminated primitive UTF-8
669+
-- encoded strings in which any emebded null characters are represented via
670+
-- the two-byte overlong-encoding: @0xC0 0x80@.
688671
--
689672
-- @since 0.11.0.0
690673
cstringUtf8 :: Addr# -> Builder
691-
cstringUtf8 =
692-
\addr0 -> builder $ step addr0
693-
where
694-
step :: Addr# -> BuildStep r -> BuildStep r
695-
step !addr !k br@(BufferRange op0@(Ptr op0#) ope)
696-
| W8# ch == 0 = k br
697-
| op0 == ope =
698-
return $ bufferFull 1 op0 (step addr k)
699-
-- NULL is encoded as 0xc0 0x80
700-
| W8# ch == 0xc0
701-
, W8# (indexWord8OffAddr# addr 1#) == 0x80 = do
702-
let !(W8# nullByte#) = 0
703-
IO $ \s -> case writeWord8OffAddr# op0# 0# nullByte# s of
704-
s' -> (# s', () #)
705-
let br' = BufferRange (op0 `plusPtr` 1) ope
706-
step (addr `plusAddr#` 2#) k br'
707-
| otherwise = do
708-
IO $ \s -> case writeWord8OffAddr# op0# 0# ch s of
709-
s' -> (# s', () #)
710-
let br' = BufferRange (op0 `plusPtr` 1) ope
711-
step (addr `plusAddr#` 1#) k br'
712-
where
713-
!ch = indexWord8OffAddr# addr 0#
674+
cstringUtf8 s = modUtf8LitCopy (Ptr s) (S.byteCountLiteral s)
675+
{-# INLINE cstringUtf8 #-}
714676

715677
------------------------------------------------------------------------------
716678
-- Char8 encoding

Data/ByteString/Internal/Type.hs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ module Data.ByteString.Internal.Type (
4242
unpackChars, unpackAppendCharsLazy, unpackAppendCharsStrict,
4343
unsafePackAddress, unsafePackLenAddress,
4444
unsafePackLiteral, unsafePackLenLiteral,
45+
byteCountLiteral,
4546

4647
-- * Low level imperative construction
4748
empty,
@@ -475,6 +476,18 @@ unsafePackLenAddress len addr# = do
475476
#endif
476477
{-# INLINE unsafePackLenAddress #-}
477478

479+
-- | Byte count of null-terminated primitive literal string excluding the
480+
-- terminating null byte.
481+
byteCountLiteral :: Addr# -> Int
482+
byteCountLiteral addr# =
483+
#if HS_cstringLength_AND_FinalPtr_AVAILABLE
484+
I# (cstringLength# addr#)
485+
#else
486+
fromIntegral @CSize @Int $
487+
accursedUnutterablePerformIO (c_strlen (Ptr addr#))
488+
#endif
489+
{-# INLINE byteCountLiteral #-}
490+
478491
-- | See 'unsafePackAddress'. This function has similar behavior. Prefer
479492
-- this function when the address in known to be an @Addr#@ literal. In
480493
-- that context, there is no need for the sequencing guarantees that 'IO'

bench/BenchAll.hs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -327,6 +327,10 @@ main = do
327327
, benchB'_ "ASCII String (12B)" $ asciiLit (Ptr "hello wurld!"#)
328328
, benchB' "ASCII String (64B, naive)" asciiStr fromString
329329
, benchB'_ "ASCII String (64B)" $ asciiLit asciiBuf
330+
, benchB'_ "strLit" $ string8 asciiStr
331+
, benchB'_ "stringUtf8" $ stringUtf8 utf8Str
332+
, benchB'_ "strLitInline" $ string8 "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
333+
, benchB'_ "utf8LitInline" $ stringUtf8 "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\0XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
330334
]
331335

332336
, bgroup "Encoding wrappers"

tests/builder/Data/ByteString/Builder/Prim/Tests.hs

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import Data.Char (ord)
1414
import qualified Data.ByteString.Lazy as L
1515
import qualified Data.ByteString.Lazy.Char8 as LC
1616
import Data.ByteString.Builder
17+
import Data.ByteString.Builder.Extra as BE
1718
import qualified Data.ByteString.Builder.Prim as BP
1819
import Data.ByteString.Builder.Prim.TestUtils
1920

@@ -22,17 +23,28 @@ import Test.Tasty.QuickCheck
2223

2324
tests :: [TestTree]
2425
tests = concat [ testsBinary, testsASCII, testsChar8, testsUtf8
25-
, testsCombinatorsB, [testCString, testCStringUtf8] ]
26+
, testsCombinatorsB
27+
, [ testCString
28+
, testCStringUtf8 1
29+
, testCStringUtf8 6
30+
, testCStringUtf8 64
31+
]
32+
]
2633

2734
testCString :: TestTree
2835
testCString = testProperty "cstring" $
2936
toLazyByteString (BP.cstring "hello world!"#) ==
3037
LC.pack "hello" `L.append` L.singleton 0x20 `L.append` LC.pack "world!"
3138

32-
testCStringUtf8 :: TestTree
33-
testCStringUtf8 = testProperty "cstringUtf8" $
34-
toLazyByteString (BP.cstringUtf8 "hello\xc0\x80world!"#) ==
35-
LC.pack "hello" `L.append` L.singleton 0x00 `L.append` LC.pack "world!"
39+
testCStringUtf8 :: Int -> TestTree
40+
testCStringUtf8 sz = testProperty "cstringUtf8" $
41+
BE.toLazyByteStringWith (BE.untrimmedStrategy sz sz) L.empty
42+
(BP.cstringUtf8 "hello\xc0\x80\xc0\x80\xd0\xbc\xd0\xb8\xd1\x80\xc0\x80\xC0"#) ==
43+
LC.pack "hello" `L.append` L.singleton 0x00
44+
`L.append` L.singleton 0x00
45+
`L.append` LC.pack "\xd0\xbc\xd0\xb8\xd1\x80"
46+
`L.append` L.singleton 0x00
47+
`L.append` L.singleton 0x00
3648

3749
------------------------------------------------------------------------------
3850
-- Binary

0 commit comments

Comments
 (0)