Make Data.ByteString.Lazy.Char8.lines less strict (#562)

vdukhovni · hs-viktor · web-flow · commit eb352a91d9a9 · 2022-12-08T21:21:01.000Z
The current implementation of `lines` in Data.ByteString.Lazy.Char8 is too
strict.  When a "line" spans multiple chunks it traverses all the chunks
to the first line boundary before constructing the list head.

For example, `lines &lt;$&gt; getContents` reading a large file with no line breaks
does not make the first chunk of the (only) line available until the entire
file is read into memory.

Co-authored-by: Viktor Dukhovni &lt;ietf-dane@dukhovni.org&gt;
diff --git a/Data/ByteString/Lazy/Char8.hs b/Data/ByteString/Lazy/Char8.hs
@@ -247,6 +247,7 @@ import qualified Data.ByteString.Lazy as L
 import qualified Data.ByteString as S (ByteString) -- typename only
 import qualified Data.ByteString as B
 import qualified Data.ByteString.Unsafe as B
+import Data.List.NonEmpty (NonEmpty(..))
 import Data.ByteString.Lazy.Internal
 import Data.ByteString.Lazy.ReadInt
 import Data.ByteString.Lazy.ReadNat
@@ -856,59 +857,50 @@ unzip :: [(Char, Char)] -> (ByteString, ByteString)
 unzip ls = (pack (fmap fst ls), pack (fmap snd ls))
 {-# INLINE unzip #-}
 
--- | 'lines' breaks a ByteString up into a list of ByteStrings at
+-- | 'lines' lazily splits a ByteString into a list of ByteStrings at
 -- newline Chars (@'\\n'@). The resulting strings do not contain newlines.
---
--- As of bytestring 0.9.0.3, this function is stricter than its
--- list cousin.
+-- The first chunk of the result is only strict in the first chunk of the
+-- input.
 --
 -- Note that it __does not__ regard CR (@'\\r'@) as a newline character.
 --
 lines :: ByteString -> [ByteString]
 lines Empty          = []
-lines (Chunk c0 cs0) = loop0 c0 cs0
-    where
-    -- this is a really performance sensitive function but the
-    -- chunked representation makes the general case a bit expensive
-    -- however assuming a large chunk size and normalish line lengths
-    -- we will find line endings much more frequently than chunk
-    -- endings so it makes sense to optimise for that common case.
-    -- So we partition into two special cases depending on whether we
-    -- are keeping back a list of chunks that will eventually be output
-    -- once we get to the end of the current line.
-
-    -- the common special case where we have no existing chunks of
-    -- the current line
-    loop0 :: S.ByteString -> ByteString -> [ByteString]
-    loop0 c cs =
-        case B.elemIndex (c2w '\n') c of
-            Nothing -> case cs of
-                           Empty  | B.null c  -> []
-                                  | otherwise -> [Chunk c Empty]
-                           (Chunk c' cs')
-                               | B.null c  -> loop0 c'     cs'
-                               | otherwise -> loop  c' [c] cs'
-
-            Just n | n /= 0    -> Chunk (B.unsafeTake n c) Empty
-                                : loop0 (B.unsafeDrop (n+1) c) cs
-                   | otherwise -> Empty
-                                : loop0 (B.unsafeTail c) cs
-
-    -- the general case when we are building a list of chunks that are
-    -- part of the same line
-    loop :: S.ByteString -> [S.ByteString] -> ByteString -> [ByteString]
-    loop c line cs =
-        case B.elemIndex (c2w '\n') c of
-            Nothing ->
-                case cs of
-                    Empty -> let !c' = revChunks (c : line)
-                              in [c']
-
-                    (Chunk c' cs') -> loop c' (c : line) cs'
-
-            Just n ->
-                let !c' = revChunks (B.unsafeTake n c : line)
-                 in c' : loop0 (B.unsafeDrop (n+1) c) cs
+lines (Chunk c0 cs0) = unNE $! go c0 cs0
+  where
+    -- Natural NonEmpty -> List
+    unNE :: NonEmpty a -> [a]
+    unNE (a :| b) = a : b
+
+    -- Strict in the first argument, lazy in the second.
+    consNE :: ByteString -> NonEmpty ByteString -> NonEmpty ByteString
+    consNE !a b = a :| (unNE $! b)
+
+    -- Note invariant: The initial chunk is non-empty on input, and we
+    -- need to be sure to maintain this in internal recursive calls.
+    go :: S.ByteString -> ByteString -> NonEmpty ByteString
+    go c cs = case B.elemIndex (c2w '\n') c of
+        Just n
+            | n1 <- n + 1
+            , n1 < B.length c -> consNE c' $ go (B.unsafeDrop n1 c) cs
+              -- 'c' was a multi-line chunk
+            | otherwise       -> c' :| lines cs
+              -- 'c' was a single-line chunk
+          where
+            !c' = chunk (B.unsafeTake n c) Empty
+
+        -- Initial chunk with no new line becomes first chunk of
+        -- first line of result, with the rest of the result lazy!
+        -- In particular, we don't strictly pattern match on 'cs'.
+        --
+        -- We can form `Chunk c ...` because the invariant is maintained
+        -- here and also by using `chunk` in the defintion of `c'` above.
+        Nothing -> let ~(l:|ls) = lazyRest cs
+                    in  Chunk c l :| ls
+          where
+            lazyRest :: ByteString -> NonEmpty ByteString
+            lazyRest (Chunk c' cs') = go c' cs'
+            lazyRest Empty          = Empty :| []
 
 -- | 'unlines' joins lines, appending a terminating newline after each.
 --
@@ -950,10 +942,3 @@ hPutStrLn h ps = hPut h ps >> hPut h (L.singleton 0x0a)
 --
 putStrLn :: ByteString -> IO ()
 putStrLn = hPutStrLn stdout
-
--- ---------------------------------------------------------------------
--- Internal utilities
-
--- reverse a list of possibly-empty chunks into a lazy ByteString
-revChunks :: [S.ByteString] -> ByteString
-revChunks = List.foldl' (flip chunk) Empty
diff --git a/tests/Properties.hs b/tests/Properties.hs
@@ -82,9 +82,29 @@ prop_unsafeTail xs = not (P.null xs) ==> P.tail xs === P.unsafeTail xs
 prop_unsafeLast xs = not (P.null xs) ==> P.last xs === P.unsafeLast xs
 prop_unsafeInit xs = not (P.null xs) ==> P.init xs === P.unsafeInit xs
 
+prop_lines_empty_invariant =
+     True === case LC.lines (LC.pack "\nfoo\n") of
+        Empty : _ -> True
+        _         -> False
+
 prop_lines_lazy =
     take 2 (LC.lines (LC.append (LC.pack "a\nb\n") undefined)) === [LC.pack "a", LC.pack "b"]
 
+prop_lines_lazy2 =
+     c === case LC.lines (Chunk c undefined) of
+        Chunk c _ : _ -> c
+        _             -> P.empty
+  where
+    c = C.pack "etc..."
+
+prop_lines_lazy3 =
+     c === case LC.lines d of
+        Chunk c _ : _ -> c
+        _             -> P.empty
+  where
+    c = C.pack "etc..."
+    d = Chunk c d
+
 prop_strip x = C.strip x == (C.dropSpace . C.reverse . C.dropSpace . C.reverse) x
 
 class (Bounded a, Integral a, Show a) => RdInt a where
@@ -684,6 +704,9 @@ misc_tests =
     , testProperty "unsafeIndex"    prop_unsafeIndexBB
 
     , testProperty "lines_lazy"     prop_lines_lazy
+    , testProperty "lines_lazy2"    prop_lines_lazy2
+    , testProperty "lines_lazy3"    prop_lines_lazy3
+    , testProperty "lines_invar"    prop_lines_empty_invariant
     , testProperty "strip"          prop_strip
     , testProperty "isSpace"        prop_isSpaceWord8