Skip to content

Commit c0ad4c8

Browse files
clyringBodigrim
authored andcommitted
Fix several bugs around the 'byteString' family of Builders (#671)
* Fix several bugs around the 'byteString' family of Builders * Add Note [byteStringCopyStep and wrappedBytesCopyStep] This makes explicit the reasoning for in what sense "ensur[ing] that the common case is not recursive" is expected to possibly "yield[] better code."
1 parent 1f3d4cc commit c0ad4c8

File tree

4 files changed

+147
-41
lines changed

4 files changed

+147
-41
lines changed

Data/ByteString/Builder/Internal.hs

Lines changed: 71 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,7 @@ import Data.Semigroup (Semigroup(..))
138138
import Data.List.NonEmpty (NonEmpty(..))
139139

140140
import qualified Data.ByteString as S
141+
import qualified Data.ByteString.Unsafe as S
141142
import qualified Data.ByteString.Internal.Type as S
142143
import qualified Data.ByteString.Lazy.Internal as L
143144
import qualified Data.ByteString.Short.Internal as Sh
@@ -819,24 +820,24 @@ ensureFree minFree =
819820
| ope `minusPtr` op < minFree = return $ bufferFull minFree op k
820821
| otherwise = k br
821822

822-
-- | Copy the bytes from a 'BufferRange' into the output stream.
823-
wrappedBytesCopyStep :: BufferRange -- ^ Input 'BufferRange'.
823+
-- | Copy the bytes from a 'S.StrictByteString' into the output stream.
824+
wrappedBytesCopyStep :: S.StrictByteString -- ^ Input 'S.StrictByteString'.
824825
-> BuildStep a -> BuildStep a
825-
wrappedBytesCopyStep (BufferRange ip0 ipe) k =
826-
go ip0
826+
-- See Note [byteStringCopyStep and wrappedBytesCopyStep]
827+
wrappedBytesCopyStep bs0 k =
828+
go bs0
827829
where
828-
go !ip (BufferRange op ope)
830+
go !bs@(S.BS ifp inpRemaining) (BufferRange op ope)
829831
| inpRemaining <= outRemaining = do
830-
copyBytes op ip inpRemaining
832+
S.unsafeWithForeignPtr ifp $ \ip -> copyBytes op ip inpRemaining
831833
let !br' = BufferRange (op `plusPtr` inpRemaining) ope
832834
k br'
833835
| otherwise = do
834-
copyBytes op ip outRemaining
835-
let !ip' = ip `plusPtr` outRemaining
836-
return $ bufferFull 1 ope (go ip')
836+
S.unsafeWithForeignPtr ifp $ \ip -> copyBytes op ip outRemaining
837+
let !bs' = S.unsafeDrop outRemaining bs
838+
return $ bufferFull 1 ope (go bs')
837839
where
838840
outRemaining = ope `minusPtr` op
839-
inpRemaining = ipe `minusPtr` ip
840841

841842

842843
-- Strict ByteStrings
@@ -857,7 +858,7 @@ byteStringThreshold :: Int -> S.StrictByteString -> Builder
857858
byteStringThreshold maxCopySize =
858859
\bs -> builder $ step bs
859860
where
860-
step bs@(S.BS _ len) !k br@(BufferRange !op _)
861+
step bs@(S.BS _ len) k br@(BufferRange !op _)
861862
| len <= maxCopySize = byteStringCopyStep bs k br
862863
| otherwise = return $ insertChunk op bs k
863864

@@ -871,21 +872,69 @@ byteStringThreshold maxCopySize =
871872
byteStringCopy :: S.StrictByteString -> Builder
872873
byteStringCopy = \bs -> builder $ byteStringCopyStep bs
873874

875+
{-
876+
Note [byteStringCopyStep and wrappedBytesCopyStep]
877+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
878+
A Builder that copies the contents of an arbitrary ByteString needs a
879+
recursive loop, since the bytes to be copied might not fit into the
880+
first few chunk buffers provided by the driver. That loop is
881+
implemented in 'wrappedBytesCopyStep'. But we also have a
882+
non-recursive wrapper, 'byteStringCopyStep', which performs exactly
883+
the first iteration of that loop, falling back to 'wrappedBytesCopyStep'
884+
if a chunk boundary is reached before the entire ByteString is copied.
885+
886+
This is very strange! Why do we do this? Perhaps mostly for
887+
historical reasons. But sadly, changing this to use a single
888+
recursive loop regresses the benchmark 'foldMap byteStringCopy' by
889+
about 30% as of 2024, in one of two ways:
890+
891+
1. If the continuation 'k' is taken as an argument of the
892+
inner copying loop, it remains an unknown function call.
893+
So for each bytestring copied, that continuation must be
894+
entered later via a gen-apply function, which incurs dozens
895+
of cycles of extra overhead.
896+
2. If the continuation 'k' is lifted out of the inner copying
897+
loop, it becomes a free variable. And after a bit of
898+
inlining, there will be no unknown function call. But, if
899+
the continuation function has any free variables, these
900+
become free variables of the inner copying loop, which
901+
prevent the loop from floating out. (In the actual
902+
benchmark, the tail of the list of bytestrings to copy is
903+
such a free variable of the continuation.) As a result,
904+
the inner copying loop becomes a function closure object
905+
rather than a top-level function. And that means a new
906+
inner-copying-loop function-closure-object must be
907+
allocated on the heap for every bytestring copied, which
908+
is expensive.
909+
910+
In theory, GHC's late-lambda-lifting pass can clean this up by
911+
abstracting over the problematic free variables. But for some
912+
unknown reason (perhaps a bug in ghc-9.10.1) this optimization
913+
does not fire on the relevant benchmark code, even with a
914+
sufficiently high value of -fstg-lift-lams-rec-args.
915+
916+
917+
918+
Alternatively, it is possible to avoid recursion altogether by
919+
requesting that the next chunk be large enough to accommodate the
920+
entire remainder of the input when a chunk boundary is reached.
921+
But:
922+
* For very large ByteStrings, this may incur unwanted latency.
923+
* Large next-chunk-size requests have caused breakage downstream
924+
in the past. See also https://github.com/yesodweb/wai/issues/894
925+
-}
926+
874927
{-# INLINE byteStringCopyStep #-}
875928
byteStringCopyStep :: S.StrictByteString -> BuildStep a -> BuildStep a
876-
byteStringCopyStep (S.BS ifp isize) !k0 br0@(BufferRange op ope)
877-
-- Ensure that the common case is not recursive and therefore yields
878-
-- better code.
879-
| op' <= ope = do copyBytes op ip isize
880-
touchForeignPtr ifp
881-
k0 (BufferRange op' ope)
882-
| otherwise = wrappedBytesCopyStep (BufferRange ip ipe) k br0
929+
-- See Note [byteStringCopyStep and wrappedBytesCopyStep]
930+
byteStringCopyStep bs@(S.BS ifp isize) k br@(BufferRange op ope)
931+
| isize <= osize = do
932+
S.unsafeWithForeignPtr ifp $ \ip -> copyBytes op ip isize
933+
k (BufferRange op' ope)
934+
| otherwise = wrappedBytesCopyStep bs k br
883935
where
936+
osize = ope `minusPtr` op
884937
op' = op `plusPtr` isize
885-
ip = unsafeForeignPtrToPtr ifp
886-
ipe = ip `plusPtr` isize
887-
k br = do touchForeignPtr ifp -- input consumed: OK to release here
888-
k0 br
889938

890939
-- | Construct a 'Builder' that always inserts the 'S.StrictByteString'
891940
-- directly as a chunk.

tests/Properties/ByteString.hs

Lines changed: 24 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -367,7 +367,7 @@ tests =
367367
, testProperty "toChunks . fromChunks" $
368368
\xs -> B.toChunks (B.fromChunks xs) === filter (/= mempty) xs
369369
, testProperty "append lazy" $
370-
\(toElem -> c) -> B.head (B.singleton c <> undefined) === c
370+
\(toElem -> c) -> B.head (B.singleton c <> tooStrictErr) === c
371371
, testProperty "compareLength 1" $
372372
\x -> B.compareLength x (B.length x) === EQ
373373
, testProperty "compareLength 2" $
@@ -379,13 +379,13 @@ tests =
379379
, testProperty "compareLength 5" $
380380
\x (intToIndexTy -> n) -> B.compareLength x n === compare (B.length x) n
381381
, testProperty "dropEnd lazy" $
382-
\(toElem -> c) -> B.take 1 (B.dropEnd 1 (B.singleton c <> B.singleton c <> B.singleton c <> undefined)) === B.singleton c
382+
\(toElem -> c) -> B.take 1 (B.dropEnd 1 (B.singleton c <> B.singleton c <> B.singleton c <> tooStrictErr)) === B.singleton c
383383
, testProperty "dropWhileEnd lazy" $
384-
\(toElem -> c) -> B.take 1 (B.dropWhileEnd (const False) (B.singleton c <> undefined)) === B.singleton c
384+
\(toElem -> c) -> B.take 1 (B.dropWhileEnd (const False) (B.singleton c <> tooStrictErr)) === B.singleton c
385385
, testProperty "breakEnd lazy" $
386-
\(toElem -> c) -> B.take 1 (fst $ B.breakEnd (const True) (B.singleton c <> undefined)) === B.singleton c
386+
\(toElem -> c) -> B.take 1 (fst $ B.breakEnd (const True) (B.singleton c <> tooStrictErr)) === B.singleton c
387387
, testProperty "spanEnd lazy" $
388-
\(toElem -> c) -> B.take 1 (fst $ B.spanEnd (const False) (B.singleton c <> undefined)) === B.singleton c
388+
\(toElem -> c) -> B.take 1 (fst $ B.spanEnd (const False) (B.singleton c <> tooStrictErr)) === B.singleton c
389389
#endif
390390

391391
, testProperty "length" $
@@ -604,12 +604,21 @@ tests =
604604
# ifdef BYTESTRING_LAZY
605605
-- Don't use (===) in these laziness tests:
606606
-- We don't want printing the test case to fail!
607-
, testProperty "zip is lazy" $ lazyZipTest $
608-
\x y -> B.zip x y == zip (B.unpack x) (B.unpack y)
609-
, testProperty "zipWith is lazy" $ \f -> lazyZipTest $
610-
\x y -> (B.zipWith f x y :: [Int]) == zipWith f (B.unpack x) (B.unpack y)
611-
, testProperty "packZipWith is lazy" $ \f -> lazyZipTest $
612-
\x y -> B.unpack (B.packZipWith ((toElem .) . f) x y) == zipWith ((toElem .) . f) (B.unpack x) (B.unpack y)
607+
, testProperty "zip is lazy in the longer input" $ zipLazyInLongerInputTest $
608+
\x y -> B.zip x y == zip (B.unpack x) (B.unpack y)
609+
, testProperty "zipWith is lazy in the longer input" $
610+
\f -> zipLazyInLongerInputTest $
611+
\x y -> (B.zipWith f x y :: [Int]) == zipWith f (B.unpack x) (B.unpack y)
612+
, testProperty "packZipWith is lazy in the longer input" $
613+
\f -> zipLazyInLongerInputTest $
614+
\x y -> B.unpack (B.packZipWith ((toElem .) . f) x y) == zipWith ((toElem .) . f) (B.unpack x) (B.unpack y)
615+
, testProperty "zip is maximally lazy" $ \x y ->
616+
zip (B.unpack x) (B.unpack y) `List.isPrefixOf`
617+
B.zip (x <> tooStrictErr) (y <> tooStrictErr)
618+
, testProperty "zipWith is maximally lazy" $ \f x y ->
619+
zipWith f (B.unpack x) (B.unpack y) `List.isPrefixOf`
620+
B.zipWith @Int f (x <> tooStrictErr) (y <> tooStrictErr)
621+
-- (It's not clear if packZipWith is required to be maximally lazy.)
613622
# endif
614623
, testProperty "unzip" $
615624
\(fmap (toElem *** toElem) -> xs) -> (B.unpack *** B.unpack) (B.unzip xs) === unzip xs
@@ -807,15 +816,15 @@ readIntegerUnsigned xs = case readMaybe ys of
807816
#endif
808817

809818
#ifdef BYTESTRING_LAZY
810-
lazyZipTest
819+
zipLazyInLongerInputTest
811820
:: Testable prop
812821
=> (BYTESTRING_TYPE -> BYTESTRING_TYPE -> prop)
813822
-> BYTESTRING_TYPE -> BYTESTRING_TYPE -> Property
814-
lazyZipTest fun = \x0 y0 -> let
823+
zipLazyInLongerInputTest fun = \x0 y0 -> let
815824
msg = "Input chunks are: " ++ show (B.toChunks x0, B.toChunks y0)
816825
(x, y) | B.length x0 <= B.length y0
817-
= (x0, y0 <> error "too strict")
826+
= (x0, y0 <> tooStrictErr)
818827
| otherwise
819-
= (x0 <> error "too strict", y0)
828+
= (x0 <> tooStrictErr, y0)
820829
in counterexample msg (fun x y)
821830
#endif

tests/QuickCheckUtils.hs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ module QuickCheckUtils
77
, CByteString(..)
88
, Sqrt(..)
99
, int64OK
10+
, tooStrictErr
1011
) where
1112

1213
import Test.Tasty.QuickCheck
@@ -19,6 +20,7 @@ import Data.Int
1920
import System.IO
2021
import Foreign.C (CChar)
2122
import GHC.TypeLits (TypeError, ErrorMessage(..))
23+
import GHC.Stack (withFrozenCallStack, HasCallStack)
2224

2325
import qualified Data.ByteString.Short as SB
2426
import qualified Data.ByteString as P
@@ -134,3 +136,7 @@ instance {-# OVERLAPPING #-}
134136
-- defined in "QuickCheckUtils".
135137
int64OK :: (Arbitrary a, Show a, Testable b) => (a -> b) -> Property
136138
int64OK f = propertyForAllShrinkShow arbitrary shrink (\v -> [show v]) f
139+
140+
tooStrictErr :: forall a. HasCallStack => a
141+
tooStrictErr = withFrozenCallStack $
142+
error "A lazy sub-expression was unexpectedly evaluated"

tests/builder/Data/ByteString/Builder/Tests.hs

Lines changed: 46 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ import Control.Monad.Trans.State (StateT, evalStateT, evalState, put,
1818
import Control.Monad.Trans.Class (lift)
1919
import Control.Monad.Trans.Writer (WriterT, execWriterT, tell)
2020

21-
import Foreign (minusPtr)
21+
import Foreign (minusPtr, castPtr, ForeignPtr, withForeignPtr, Int64)
2222

2323
import Data.Char (chr)
2424
import Data.Bits ((.|.), shiftL)
@@ -40,7 +40,6 @@ import Data.ByteString.Builder.Prim.TestUtils
4040

4141
import Control.Exception (evaluate)
4242
import System.IO (openTempFile, hPutStr, hClose, hSetBinaryMode, hSetEncoding, utf8, hSetNewlineMode, noNewlineTranslation)
43-
import Foreign (ForeignPtr, withForeignPtr, castPtr)
4443
import Foreign.C.String (withCString)
4544
import Numeric (showFFloat)
4645
import System.Posix.Internals (c_unlink)
@@ -50,7 +49,8 @@ import Test.Tasty.QuickCheck
5049
( Arbitrary(..), oneof, choose, listOf, elements
5150
, counterexample, ioProperty, Property, testProperty
5251
, (===), (.&&.), conjoin, forAll, forAllShrink
53-
, UnicodeString(..), NonNegative(..)
52+
, UnicodeString(..), NonNegative(..), Positive(..)
53+
, mapSize, (==>)
5454
)
5555
import QuickCheckUtils
5656

@@ -70,7 +70,8 @@ tests =
7070
testsASCII ++
7171
testsFloating ++
7272
testsChar8 ++
73-
testsUtf8
73+
testsUtf8 ++
74+
[testLaziness]
7475

7576

7677
------------------------------------------------------------------------------
@@ -981,3 +982,44 @@ testsUtf8 =
981982
[ testBuilderConstr "charUtf8" charUtf8_list charUtf8
982983
, testBuilderConstr "stringUtf8" (foldMap charUtf8_list) stringUtf8
983984
]
985+
986+
testLaziness :: TestTree
987+
testLaziness = testGroup "Builder laziness"
988+
[ testProperty "byteString" $ mapSize (+ 10) $
989+
\bs (Positive chunkSize) ->
990+
let strategy = safeStrategy chunkSize chunkSize
991+
lbs = toLazyByteStringWith strategy L.empty
992+
(byteString bs <> tooStrictErr)
993+
in (S.length bs > max chunkSize 8) ==> L.head lbs == S.head bs
994+
, testProperty "byteStringCopy" $ mapSize (+ 10) $
995+
\bs (Positive chunkSize) ->
996+
let strategy = safeStrategy chunkSize chunkSize
997+
lbs = toLazyByteStringWith strategy L.empty
998+
(byteStringCopy bs <> tooStrictErr)
999+
in (S.length bs > max chunkSize 8) ==> L.head lbs == S.head bs
1000+
, testProperty "byteStringInsert" $ mapSize (+ 10) $
1001+
\bs (Positive chunkSize) ->
1002+
let strategy = safeStrategy chunkSize chunkSize
1003+
lbs = toLazyByteStringWith strategy L.empty
1004+
(byteStringInsert bs <> tooStrictErr)
1005+
in L.take (fromIntegral @Int @Int64 (S.length bs)) lbs
1006+
== L.fromStrict bs
1007+
, testProperty "lazyByteString" $ mapSize (+ 10) $
1008+
\bs (Positive chunkSize) ->
1009+
let strategy = safeStrategy chunkSize chunkSize
1010+
lbs = toLazyByteStringWith strategy L.empty
1011+
(lazyByteString bs <> tooStrictErr)
1012+
in (L.length bs > fromIntegral @Int @Int64 (max chunkSize 8))
1013+
==> L.head lbs == L.head bs
1014+
, testProperty "shortByteString" $ mapSize (+ 10) $
1015+
\bs (Positive chunkSize) ->
1016+
let strategy = safeStrategy chunkSize chunkSize
1017+
lbs = toLazyByteStringWith strategy L.empty
1018+
(shortByteString bs <> tooStrictErr)
1019+
in (Sh.length bs > max chunkSize 8) ==> L.head lbs == Sh.head bs
1020+
, testProperty "flush" $ \recipe -> let
1021+
!(b, toLBS) = recipeComponents recipe
1022+
!lbs1 = toLazyByteString b
1023+
!lbs2 = L.take (L.length lbs1) (toLBS $ b <> flush <> tooStrictErr)
1024+
in lbs1 == lbs2
1025+
]

0 commit comments

Comments
 (0)