@@ -138,6 +138,7 @@ import Data.Semigroup (Semigroup(..))
138138import Data.List.NonEmpty (NonEmpty (.. ))
139139
140140import qualified Data.ByteString as S
141+ import qualified Data.ByteString.Unsafe as S
141142import qualified Data.ByteString.Internal.Type as S
142143import qualified Data.ByteString.Lazy.Internal as L
143144import qualified Data.ByteString.Short.Internal as Sh
@@ -819,24 +820,24 @@ ensureFree minFree =
819820 | ope `minusPtr` op < minFree = return $ bufferFull minFree op k
820821 | otherwise = k br
821822
822- -- | Copy the bytes from a 'BufferRange ' into the output stream.
823- wrappedBytesCopyStep :: BufferRange -- ^ Input 'BufferRange '.
823+ -- | Copy the bytes from a 'S.StrictByteString ' into the output stream.
824+ wrappedBytesCopyStep :: S. StrictByteString -- ^ Input 'S.StrictByteString '.
824825 -> BuildStep a -> BuildStep a
825- wrappedBytesCopyStep (BufferRange ip0 ipe) k =
826- go ip0
826+ -- See Note [byteStringCopyStep and wrappedBytesCopyStep]
827+ wrappedBytesCopyStep bs0 k =
828+ go bs0
827829 where
828- go ! ip (BufferRange op ope)
830+ go ! bs @ ( S. BS ifp inpRemaining) (BufferRange op ope)
829831 | inpRemaining <= outRemaining = do
830- copyBytes op ip inpRemaining
832+ S. unsafeWithForeignPtr ifp $ \ ip -> copyBytes op ip inpRemaining
831833 let ! br' = BufferRange (op `plusPtr` inpRemaining) ope
832834 k br'
833835 | otherwise = do
834- copyBytes op ip outRemaining
835- let ! ip ' = ip `plusPtr` outRemaining
836- return $ bufferFull 1 ope (go ip ')
836+ S. unsafeWithForeignPtr ifp $ \ ip -> copyBytes op ip outRemaining
837+ let ! bs ' = S. unsafeDrop outRemaining bs
838+ return $ bufferFull 1 ope (go bs ')
837839 where
838840 outRemaining = ope `minusPtr` op
839- inpRemaining = ipe `minusPtr` ip
840841
841842
842843-- Strict ByteStrings
@@ -857,7 +858,7 @@ byteStringThreshold :: Int -> S.StrictByteString -> Builder
857858byteStringThreshold maxCopySize =
858859 \ bs -> builder $ step bs
859860 where
860- step bs@ (S. BS _ len) ! k br@ (BufferRange ! op _)
861+ step bs@ (S. BS _ len) k br@ (BufferRange ! op _)
861862 | len <= maxCopySize = byteStringCopyStep bs k br
862863 | otherwise = return $ insertChunk op bs k
863864
@@ -871,21 +872,69 @@ byteStringThreshold maxCopySize =
871872byteStringCopy :: S. StrictByteString -> Builder
872873byteStringCopy = \ bs -> builder $ byteStringCopyStep bs
873874
875+ {-
876+ Note [byteStringCopyStep and wrappedBytesCopyStep]
877+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
878+ A Builder that copies the contents of an arbitrary ByteString needs a
879+ recursive loop, since the bytes to be copied might not fit into the
880+ first few chunk buffers provided by the driver. That loop is
881+ implemented in 'wrappedBytesCopyStep'. But we also have a
882+ non-recursive wrapper, 'byteStringCopyStep', which performs exactly
883+ the first iteration of that loop, falling back to 'wrappedBytesCopyStep'
884+ if a chunk boundary is reached before the entire ByteString is copied.
885+
886+ This is very strange! Why do we do this? Perhaps mostly for
887+ historical reasons. But sadly, changing this to use a single
888+ recursive loop regresses the benchmark 'foldMap byteStringCopy' by
889+ about 30% as of 2024, in one of two ways:
890+
891+ 1. If the continuation 'k' is taken as an argument of the
892+ inner copying loop, it remains an unknown function call.
893+ So for each bytestring copied, that continuation must be
894+ entered later via a gen-apply function, which incurs dozens
895+ of cycles of extra overhead.
896+ 2. If the continuation 'k' is lifted out of the inner copying
897+ loop, it becomes a free variable. And after a bit of
898+ inlining, there will be no unknown function call. But, if
899+ the continuation function has any free variables, these
900+ become free variables of the inner copying loop, which
901+ prevent the loop from floating out. (In the actual
902+ benchmark, the tail of the list of bytestrings to copy is
903+ such a free variable of the continuation.) As a result,
904+ the inner copying loop becomes a function closure object
905+ rather than a top-level function. And that means a new
906+ inner-copying-loop function-closure-object must be
907+ allocated on the heap for every bytestring copied, which
908+ is expensive.
909+
910+ In theory, GHC's late-lambda-lifting pass can clean this up by
911+ abstracting over the problematic free variables. But for some
912+ unknown reason (perhaps a bug in ghc-9.10.1) this optimization
913+ does not fire on the relevant benchmark code, even with a
914+ sufficiently high value of -fstg-lift-lams-rec-args.
915+
916+
917+
918+ Alternatively, it is possible to avoid recursion altogether by
919+ requesting that the next chunk be large enough to accommodate the
920+ entire remainder of the input when a chunk boundary is reached.
921+ But:
922+ * For very large ByteStrings, this may incur unwanted latency.
923+ * Large next-chunk-size requests have caused breakage downstream
924+ in the past. See also https://github.com/yesodweb/wai/issues/894
925+ -}
926+
874927{-# INLINE byteStringCopyStep #-}
875928byteStringCopyStep :: S. StrictByteString -> BuildStep a -> BuildStep a
876- byteStringCopyStep (S. BS ifp isize) ! k0 br0@ (BufferRange op ope)
877- -- Ensure that the common case is not recursive and therefore yields
878- -- better code.
879- | op' <= ope = do copyBytes op ip isize
880- touchForeignPtr ifp
881- k0 (BufferRange op' ope)
882- | otherwise = wrappedBytesCopyStep (BufferRange ip ipe) k br0
929+ -- See Note [byteStringCopyStep and wrappedBytesCopyStep]
930+ byteStringCopyStep bs@ (S. BS ifp isize) k br@ (BufferRange op ope)
931+ | isize <= osize = do
932+ S. unsafeWithForeignPtr ifp $ \ ip -> copyBytes op ip isize
933+ k (BufferRange op' ope)
934+ | otherwise = wrappedBytesCopyStep bs k br
883935 where
936+ osize = ope `minusPtr` op
884937 op' = op `plusPtr` isize
885- ip = unsafeForeignPtrToPtr ifp
886- ipe = ip `plusPtr` isize
887- k br = do touchForeignPtr ifp -- input consumed: OK to release here
888- k0 br
889938
890939-- | Construct a 'Builder' that always inserts the 'S.StrictByteString'
891940-- directly as a chunk.
0 commit comments