Skip to content

Commit eb352a9

Browse files
vdukhovnihs-viktor
andauthored
Make Data.ByteString.Lazy.Char8.lines less strict (#562)
The current implementation of `lines` in Data.ByteString.Lazy.Char8 is too strict. When a "line" spans multiple chunks it traverses all the chunks to the first line boundary before constructing the list head. For example, `lines <$> getContents` reading a large file with no line breaks does not make the first chunk of the (only) line available until the entire file is read into memory. Co-authored-by: Viktor Dukhovni <[email protected]>
1 parent 403d920 commit eb352a9

File tree

2 files changed

+62
-54
lines changed

2 files changed

+62
-54
lines changed

Data/ByteString/Lazy/Char8.hs

Lines changed: 39 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,7 @@ import qualified Data.ByteString.Lazy as L
247247
import qualified Data.ByteString as S (ByteString) -- typename only
248248
import qualified Data.ByteString as B
249249
import qualified Data.ByteString.Unsafe as B
250+
import Data.List.NonEmpty (NonEmpty(..))
250251
import Data.ByteString.Lazy.Internal
251252
import Data.ByteString.Lazy.ReadInt
252253
import Data.ByteString.Lazy.ReadNat
@@ -856,59 +857,50 @@ unzip :: [(Char, Char)] -> (ByteString, ByteString)
856857
unzip ls = (pack (fmap fst ls), pack (fmap snd ls))
857858
{-# INLINE unzip #-}
858859

859-
-- | 'lines' breaks a ByteString up into a list of ByteStrings at
860+
-- | 'lines' lazily splits a ByteString into a list of ByteStrings at
860861
-- newline Chars (@'\\n'@). The resulting strings do not contain newlines.
861-
--
862-
-- As of bytestring 0.9.0.3, this function is stricter than its
863-
-- list cousin.
862+
-- The first chunk of the result is only strict in the first chunk of the
863+
-- input.
864864
--
865865
-- Note that it __does not__ regard CR (@'\\r'@) as a newline character.
866866
--
867867
lines :: ByteString -> [ByteString]
868868
lines Empty = []
869-
lines (Chunk c0 cs0) = loop0 c0 cs0
870-
where
871-
-- this is a really performance sensitive function but the
872-
-- chunked representation makes the general case a bit expensive
873-
-- however assuming a large chunk size and normalish line lengths
874-
-- we will find line endings much more frequently than chunk
875-
-- endings so it makes sense to optimise for that common case.
876-
-- So we partition into two special cases depending on whether we
877-
-- are keeping back a list of chunks that will eventually be output
878-
-- once we get to the end of the current line.
879-
880-
-- the common special case where we have no existing chunks of
881-
-- the current line
882-
loop0 :: S.ByteString -> ByteString -> [ByteString]
883-
loop0 c cs =
884-
case B.elemIndex (c2w '\n') c of
885-
Nothing -> case cs of
886-
Empty | B.null c -> []
887-
| otherwise -> [Chunk c Empty]
888-
(Chunk c' cs')
889-
| B.null c -> loop0 c' cs'
890-
| otherwise -> loop c' [c] cs'
891-
892-
Just n | n /= 0 -> Chunk (B.unsafeTake n c) Empty
893-
: loop0 (B.unsafeDrop (n+1) c) cs
894-
| otherwise -> Empty
895-
: loop0 (B.unsafeTail c) cs
896-
897-
-- the general case when we are building a list of chunks that are
898-
-- part of the same line
899-
loop :: S.ByteString -> [S.ByteString] -> ByteString -> [ByteString]
900-
loop c line cs =
901-
case B.elemIndex (c2w '\n') c of
902-
Nothing ->
903-
case cs of
904-
Empty -> let !c' = revChunks (c : line)
905-
in [c']
906-
907-
(Chunk c' cs') -> loop c' (c : line) cs'
908-
909-
Just n ->
910-
let !c' = revChunks (B.unsafeTake n c : line)
911-
in c' : loop0 (B.unsafeDrop (n+1) c) cs
869+
lines (Chunk c0 cs0) = unNE $! go c0 cs0
870+
where
871+
-- Natural NonEmpty -> List
872+
unNE :: NonEmpty a -> [a]
873+
unNE (a :| b) = a : b
874+
875+
-- Strict in the first argument, lazy in the second.
876+
consNE :: ByteString -> NonEmpty ByteString -> NonEmpty ByteString
877+
consNE !a b = a :| (unNE $! b)
878+
879+
-- Note invariant: The initial chunk is non-empty on input, and we
880+
-- need to be sure to maintain this in internal recursive calls.
881+
go :: S.ByteString -> ByteString -> NonEmpty ByteString
882+
go c cs = case B.elemIndex (c2w '\n') c of
883+
Just n
884+
| n1 <- n + 1
885+
, n1 < B.length c -> consNE c' $ go (B.unsafeDrop n1 c) cs
886+
-- 'c' was a multi-line chunk
887+
| otherwise -> c' :| lines cs
888+
-- 'c' was a single-line chunk
889+
where
890+
!c' = chunk (B.unsafeTake n c) Empty
891+
892+
-- Initial chunk with no new line becomes first chunk of
893+
-- first line of result, with the rest of the result lazy!
894+
-- In particular, we don't strictly pattern match on 'cs'.
895+
--
896+
-- We can form `Chunk c ...` because the invariant is maintained
897+
-- here and also by using `chunk` in the defintion of `c'` above.
898+
Nothing -> let ~(l:|ls) = lazyRest cs
899+
in Chunk c l :| ls
900+
where
901+
lazyRest :: ByteString -> NonEmpty ByteString
902+
lazyRest (Chunk c' cs') = go c' cs'
903+
lazyRest Empty = Empty :| []
912904

913905
-- | 'unlines' joins lines, appending a terminating newline after each.
914906
--
@@ -950,10 +942,3 @@ hPutStrLn h ps = hPut h ps >> hPut h (L.singleton 0x0a)
950942
--
951943
putStrLn :: ByteString -> IO ()
952944
putStrLn = hPutStrLn stdout
953-
954-
-- ---------------------------------------------------------------------
955-
-- Internal utilities
956-
957-
-- reverse a list of possibly-empty chunks into a lazy ByteString
958-
revChunks :: [S.ByteString] -> ByteString
959-
revChunks = List.foldl' (flip chunk) Empty

tests/Properties.hs

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,9 +82,29 @@ prop_unsafeTail xs = not (P.null xs) ==> P.tail xs === P.unsafeTail xs
8282
prop_unsafeLast xs = not (P.null xs) ==> P.last xs === P.unsafeLast xs
8383
prop_unsafeInit xs = not (P.null xs) ==> P.init xs === P.unsafeInit xs
8484

85+
prop_lines_empty_invariant =
86+
True === case LC.lines (LC.pack "\nfoo\n") of
87+
Empty : _ -> True
88+
_ -> False
89+
8590
prop_lines_lazy =
8691
take 2 (LC.lines (LC.append (LC.pack "a\nb\n") undefined)) === [LC.pack "a", LC.pack "b"]
8792

93+
prop_lines_lazy2 =
94+
c === case LC.lines (Chunk c undefined) of
95+
Chunk c _ : _ -> c
96+
_ -> P.empty
97+
where
98+
c = C.pack "etc..."
99+
100+
prop_lines_lazy3 =
101+
c === case LC.lines d of
102+
Chunk c _ : _ -> c
103+
_ -> P.empty
104+
where
105+
c = C.pack "etc..."
106+
d = Chunk c d
107+
88108
prop_strip x = C.strip x == (C.dropSpace . C.reverse . C.dropSpace . C.reverse) x
89109

90110
class (Bounded a, Integral a, Show a) => RdInt a where
@@ -684,6 +704,9 @@ misc_tests =
684704
, testProperty "unsafeIndex" prop_unsafeIndexBB
685705

686706
, testProperty "lines_lazy" prop_lines_lazy
707+
, testProperty "lines_lazy2" prop_lines_lazy2
708+
, testProperty "lines_lazy3" prop_lines_lazy3
709+
, testProperty "lines_invar" prop_lines_empty_invariant
687710
, testProperty "strip" prop_strip
688711
, testProperty "isSpace" prop_isSpaceWord8
689712

0 commit comments

Comments
 (0)