Avoid per-byte loop in cstring{,Utf8} builders

hs-viktor · hs-viktor · commit 395092a1a0a3 · 2025-10-13T22:31:51.000+11:00
Copy chunks of the input to the output buffer with, up to the shorter
of the available buffer space and the "null-free" portion of the remaining
string.  Actually "null-free" here means not containing any denormalised
two-byte encodings starting with 0xC0 (so possibly also other ASCII
bytes if the UTF-8 encoding is oddball).

This substantially improves performance, with just one "15%" increase
that looks like a spurious measurement error (perhaps code layout
difference artefact).

      UTF-8 String (12B):                            OK
        16.7 ns ± 1.3 ns, 60% less than baseline
      UTF-8 String (64B, one null):                  OK
        22.6 ns ± 1.3 ns, 87% less than baseline
      UTF-8 String (64B, one null, no shared work):  OK
        30.1 ns ± 2.6 ns, 83% less than baseline
      UTF-8 String (64B, half nulls):                OK
        92.6 ns ± 5.3 ns, 49% less than baseline
      UTF-8 String (64B, all nulls):                 OK
        76.3 ns ± 4.5 ns, 57% less than baseline
      UTF-8 String (64B, all nulls, no shared work): OK
        82.3 ns ± 5.6 ns, 54% less than baseline
      ASCII String (12B):                            OK
        6.50 ns ± 326 ps, 76% less than baseline
      ASCII String (64B):                            OK
        8.03 ns ± 334 ps, 94% less than baseline
      AsciiLit:                                      OK
        8.02 ns ± 648 ps, 94% less than baseline
      Utf8Lit:                                       OK
        21.8 ns ± 1.3 ns, 88% less than baseline
      strLit:                                        OK
        8.90 ns ± 788 ps, 94% less than baseline
      stringUtf8:                                    OK
        22.4 ns ± 1.3 ns, 87% less than baseline
      strLitInline:                                  OK
        8.26 ns ± 676 ps, 94% less than baseline
      utf8LitInline:                                 OK
        23.2 ns ± 1.3 ns, 87% less than baseline
      foldMap byteStringInsert (10000):              OK
        46.0 μs ± 4.0 μs, 15% less than baseline
--&gt;   lazyByteStringHex (10000):                     OK
--&gt;     4.74 μs ± 337 ns, 15% more than baseline
      foldMap integerDec (small) (10000):            OK
        205  μs ±  12 μs,  9% less than baseline
    char8 (10000):                                   OK
      2.58 μs ± 234 ns, 30% less than baseline
      foldMap (left-assoc) (10000):                  OK
        73.2 μs ± 2.9 μs, 54% less than baseline
      foldMap (right-assoc) (10000):                 OK
        43.0 μs ± 4.2 μs, 65% less than baseline
      foldMap [manually fused, left-assoc] (10000):  OK
        81.4 μs ± 5.3 μs, 48% less than baseline
      foldMap [manually fused, right-assoc] (10000): OK
        47.3 μs ± 785 ns, 61% less than baseline
diff --git a/Data/ByteString/Builder/Internal.hs b/Data/ByteString/Builder/Internal.hs
@@ -1,6 +1,8 @@
 {-# LANGUAGE Unsafe #-}
 {-# LANGUAGE TypeFamilies #-}
+{-# LANGUAGE TypeApplications #-}
 {-# LANGUAGE NoMonoLocalBinds #-}
+{-# LANGUAGE ViewPatterns #-}
 
 {-# OPTIONS_HADDOCK not-home #-}
 
@@ -87,6 +89,8 @@ module Data.ByteString.Builder.Internal (
   -- , sizedChunksInsert
 
   , byteStringCopy
+  , asciiLiteralCopy
+  , modUtf8LitCopy
   , byteStringInsert
   , byteStringThreshold
 
@@ -816,6 +820,7 @@ ensureFree :: Int -> Builder
 ensureFree minFree =
     builder step
   where
+    step :: forall r. BuildStep r -> BuildStep r
     step k br@(BufferRange op ope)
       | ope `minusPtr` op < minFree = return $ bufferFull minFree op k
       | otherwise                   = k br
@@ -839,6 +844,25 @@ wrappedBytesCopyStep bs0 k =
       where
         outRemaining = ope `minusPtr` op
 
+-- | Copy the bytes from a 'BufferRange' into the output stream.
+wrappedBufferRangeCopyStep :: BufferRange  -- ^ Input 'BufferRange'.
+                           -> BuildStep a -> BuildStep a
+wrappedBufferRangeCopyStep (BufferRange ip0 ipe) k =
+    go ip0
+  where
+    go !ip (BufferRange op ope)
+      | inpRemaining <= outRemaining = do
+          copyBytes op ip inpRemaining
+          let !br' = BufferRange (op `plusPtr` inpRemaining) ope
+          k br'
+      | otherwise = do
+          copyBytes op ip outRemaining
+          let !ip' = ip `plusPtr` outRemaining
+          return $ bufferFull 1 ope (go ip')
+      where
+        outRemaining = ope `minusPtr` op
+        inpRemaining = ipe `minusPtr` ip
+
 
 -- Strict ByteStrings
 ------------------------------------------------------------------------------
@@ -858,6 +882,7 @@ byteStringThreshold :: Int -> S.StrictByteString -> Builder
 byteStringThreshold maxCopySize =
     \bs -> builder $ step bs
   where
+    step :: forall r. S.ByteString -> BuildStep r -> BuildStep r
     step bs@(S.BS _ len) k br@(BufferRange !op _)
       | len <= maxCopySize = byteStringCopyStep bs k br
       | otherwise          = return $ insertChunk op bs k
@@ -949,6 +974,88 @@ byteStringInsert :: S.StrictByteString -> Builder
 byteStringInsert =
     \bs -> builder $ \k (BufferRange op _) -> return $ insertChunk op bs k
 
+
+------------------------------------------------------------------------------
+-- Raw CString encoding
+------------------------------------------------------------------------------
+
+-- | Builder for raw pointers to static data of known length that will never be
+-- moved or freed.  (This is used with the static buffers GHC uses to implement
+-- ASCII string literals that do not contain null characters.)
+--
+-- @since 0.13.0.0
+{-# INLINABLE asciiLiteralCopy #-}
+asciiLiteralCopy :: Ptr Word8 -> Int -> Builder
+asciiLiteralCopy = \ !ip !len -> builder $ \k br@(BufferRange op ope) ->
+    if len <= ope `minusPtr` op
+    then copyBytes op ip len >> k (BufferRange (op `plusPtr` len) ope)
+    else wrappedBufferRangeCopyStep (BufferRange ip (ip `plusPtr` len)) k br
+
+-- | Builder for pointers to /null-terminated/ primitive UTF-8 encoded strings
+-- that may contain embedded overlong two-byte encodings of the NUL character
+-- as @0xC0 0x80@.  Other deviations from strict UTF-8 are tolerated, but the
+-- result is not well defined.
+--
+-- @since 0.13.0.0
+{-# INLINABLE modUtf8LitCopy #-}
+modUtf8LitCopy :: Ptr Word8 -> Int -> Builder
+modUtf8LitCopy !ip !len
+    | len > 0   = builder (modUtf8_step ip len)
+    | otherwise = builder id
+
+-- | Copy a /non-empty/ UTF-8 input possibly containing denormalised 2-octet
+-- sequences.  While only the NUL byte should ever encoded that way (as @0xC0
+-- 80@), this handles other denormalised @0xC0 0x??@  sequences by keeping the
+-- bottom 6 bits of the second byte.  If the input is non-UTF8 garbage, the the
+-- result may not be what the user expected.
+--
+modUtf8_step :: Ptr Word8 -> Int -> BuildStep r -> BuildStep r
+modUtf8_step !ip !len k (BufferRange op ope)
+    | op == ope = return $ bufferFull 1 op (modUtf8_step ip len k)
+    | otherwise = do
+        let !avail = ope `minusPtr` op
+            !usable = avail `min` len
+        -- null-termination makes it possible to read one more byte than the
+        -- nominal input length, with any unexpected 0xC000 ending interpreted
+        -- as a NUL.  More typically, this simplifies hanlding of inputs where
+        -- 0xC0 0x80 might otherwise be split across the "usable" input window.
+        !ch <- peekElemOff ip (usable - 1)
+        let !use | ch /= 0xC0 = usable
+                 | otherwise  = usable + 1
+        !n <- utf8_copyBytes (ip `plusPtr` use) ip op
+        let !op' = op `plusPtr` n
+            !len' = len - use
+            ip' = ip `plusPtr` use
+        if | len' <= 0 -> k (BufferRange op' ope)
+           | op' < ope -> modUtf8_step ip' len' k (BufferRange op' ope)
+           | otherwise -> return $ bufferFull 1 op' (modUtf8_step ip' len' k)
+
+-- | Consume the supplied input returning the number of bytes written
+utf8_copyBytes :: Ptr Word8 -> Ptr Word8 -> Ptr Word8 -> IO Int
+utf8_copyBytes !ipe = \ ip op -> go 0 ip op
+  where
+    go :: Int -> Ptr Word8 -> Ptr Word8 -> IO Int
+    go !n !ip@((< ipe) -> True) !op = do
+        !ch <- peek ip
+        let !ip' = ip `plusPtr` 1
+            !op' = op `plusPtr` 1
+        if | ch /= 0xC0 -> do
+               poke op ch
+               let !cnt = ipe `minusPtr` ip'
+               !runend <- S.memchr ip' 0xC0 (fromIntegral @Int cnt)
+               let !runlen | runend == nullPtr = cnt
+                           | otherwise = runend `minusPtr` ip'
+               if (runlen == 0)
+               then go (n + 1) ip' op'
+               else do
+                   copyBytes op' ip' runlen
+                   go (n + 1 + runlen) (ip' `plusPtr` runlen) (op' `plusPtr` runlen)
+           | otherwise -> do
+               !ch' <- peek ip'
+               poke op (ch' .&. 0x3f)
+               go (n + 1) (ip' `plusPtr` 1) op'
+    go !n _ _ = pure n
+
 -- Short bytestrings
 ------------------------------------------------------------------------------
 
diff --git a/Data/ByteString/Builder/Prim.hs b/Data/ByteString/Builder/Prim.hs
@@ -453,6 +453,7 @@ import           Data.ByteString.Builder.Internal
 
 import qualified Data.ByteString               as S
 import qualified Data.ByteString.Internal      as S
+import qualified Data.ByteString.Internal.Type as S
 import qualified Data.ByteString.Lazy.Internal as L
 
 import           Data.Char (ord)
@@ -464,9 +465,7 @@ import           Data.ByteString.Builder.Prim.ASCII
 
 import           Foreign
 import           Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr)
-import           GHC.Word (Word8 (..))
 import           GHC.Exts
-import           GHC.IO
 
 ------------------------------------------------------------------------------
 -- Creating Builders from bounded primitives
@@ -658,59 +657,22 @@ primMapLazyByteStringBounded w =
     L.foldrChunks (\x b -> primMapByteStringBounded w x `mappend` b) mempty
 
 
-------------------------------------------------------------------------------
--- Raw CString encoding
-------------------------------------------------------------------------------
-
--- | A null-terminated ASCII encoded 'Foreign.C.String.CString'.
--- Null characters are not representable.
+-- | Builder for raw 'Addr#' pointers to null-terminated primitive ASCII
+-- strings that are free of embedded null characters.
 --
 -- @since 0.11.0.0
 cstring :: Addr# -> Builder
-cstring =
-    \addr0 -> builder $ step addr0
-  where
-    step :: Addr# -> BuildStep r -> BuildStep r
-    step !addr !k br@(BufferRange op0@(Ptr op0#) ope)
-      | W8# ch == 0 = k br
-      | op0 == ope =
-          return $ bufferFull 1 op0 (step addr k)
-      | otherwise = do
-          IO $ \s -> case writeWord8OffAddr# op0# 0# ch s of
-                       s' -> (# s', () #)
-          let br' = BufferRange (op0 `plusPtr` 1) ope
-          step (addr `plusAddr#` 1#) k br'
-      where
-        !ch = indexWord8OffAddr# addr 0#
+cstring s = asciiLiteralCopy (Ptr s) (S.byteCountLiteral s)
+{-# INLINE cstring #-}
 
--- | A null-terminated UTF-8 encoded 'Foreign.C.String.CString'.
--- Null characters can be encoded as @0xc0 0x80@.
+-- | Builder for raw 'Addr#' pointers to null-terminated primitive UTF-8
+-- encoded strings in which any emebded null characters are represented via
+-- the two-byte overlong-encoding: @0xC0 0x80@.
 --
 -- @since 0.11.0.0
 cstringUtf8 :: Addr# -> Builder
-cstringUtf8 =
-    \addr0 -> builder $ step addr0
-  where
-    step :: Addr# -> BuildStep r -> BuildStep r
-    step !addr !k br@(BufferRange op0@(Ptr op0#) ope)
-      | W8# ch == 0 = k br
-      | op0 == ope =
-          return $ bufferFull 1 op0 (step addr k)
-        -- NULL is encoded as 0xc0 0x80
-      | W8# ch == 0xc0
-      , W8# (indexWord8OffAddr# addr 1#) == 0x80 = do
-          let !(W8# nullByte#) = 0
-          IO $ \s -> case writeWord8OffAddr# op0# 0# nullByte# s of
-                       s' -> (# s', () #)
-          let br' = BufferRange (op0 `plusPtr` 1) ope
-          step (addr `plusAddr#` 2#) k br'
-      | otherwise = do
-          IO $ \s -> case writeWord8OffAddr# op0# 0# ch s of
-                       s' -> (# s', () #)
-          let br' = BufferRange (op0 `plusPtr` 1) ope
-          step (addr `plusAddr#` 1#) k br'
-      where
-        !ch = indexWord8OffAddr# addr 0#
+cstringUtf8 s = modUtf8LitCopy (Ptr s) (S.byteCountLiteral s)
+{-# INLINE cstringUtf8 #-}
 
 ------------------------------------------------------------------------------
 -- Char8 encoding
diff --git a/Data/ByteString/Internal/Type.hs b/Data/ByteString/Internal/Type.hs
@@ -42,6 +42,7 @@ module Data.ByteString.Internal.Type (
         unpackChars, unpackAppendCharsLazy, unpackAppendCharsStrict,
         unsafePackAddress, unsafePackLenAddress,
         unsafePackLiteral, unsafePackLenLiteral,
+        byteCountLiteral,
 
         -- * Low level imperative construction
         empty,
@@ -475,6 +476,18 @@ unsafePackLenAddress len addr# = do
 #endif
 {-# INLINE unsafePackLenAddress #-}
 
+-- | Byte count of null-terminated primitive literal string excluding the
+-- terminating null byte.
+byteCountLiteral :: Addr# -> Int
+byteCountLiteral addr# =
+#if HS_cstringLength_AND_FinalPtr_AVAILABLE
+  I# (cstringLength# addr#)
+#else
+  fromIntegral @CSize @Int $
+    accursedUnutterablePerformIO (c_strlen (Ptr addr#))
+#endif
+{-# INLINE byteCountLiteral #-}
+
 -- | See 'unsafePackAddress'. This function has similar behavior. Prefer
 -- this function when the address in known to be an @Addr#@ literal. In
 -- that context, there is no need for the sequencing guarantees that 'IO'
diff --git a/bench/BenchAll.hs b/bench/BenchAll.hs
@@ -327,6 +327,10 @@ main = do
         , benchB'_ "ASCII String (12B)" $ asciiLit (Ptr "hello wurld!"#)
         , benchB'  "ASCII String (64B, naive)" asciiStr fromString
         , benchB'_ "ASCII String (64B)" $ asciiLit asciiBuf
+        , benchB'_ "strLit"        $ string8 asciiStr
+        , benchB'_ "stringUtf8"    $ stringUtf8 utf8Str
+        , benchB'_ "strLitInline"  $ string8 "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
+        , benchB'_ "utf8LitInline" $ stringUtf8 "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\0XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
         ]
 
       , bgroup "Encoding wrappers"
diff --git a/tests/builder/Data/ByteString/Builder/Prim/Tests.hs b/tests/builder/Data/ByteString/Builder/Prim/Tests.hs
@@ -14,6 +14,7 @@ import           Data.Char  (ord)
 import qualified Data.ByteString.Lazy                  as L
 import qualified Data.ByteString.Lazy.Char8            as LC
 import           Data.ByteString.Builder
+import           Data.ByteString.Builder.Extra         as BE
 import qualified Data.ByteString.Builder.Prim          as BP
 import           Data.ByteString.Builder.Prim.TestUtils
 
@@ -22,17 +23,28 @@ import           Test.Tasty.QuickCheck
 
 tests :: [TestTree]
 tests = concat [ testsBinary, testsASCII, testsChar8, testsUtf8
-               , testsCombinatorsB, [testCString, testCStringUtf8] ]
+               , testsCombinatorsB
+               , [ testCString
+                 , testCStringUtf8 1
+                 , testCStringUtf8 6
+                 , testCStringUtf8 64
+                 ]
+               ]
 
 testCString :: TestTree
 testCString = testProperty "cstring" $
     toLazyByteString (BP.cstring "hello world!"#) ==
       LC.pack "hello" `L.append` L.singleton 0x20 `L.append` LC.pack "world!"
 
-testCStringUtf8 :: TestTree
-testCStringUtf8 = testProperty "cstringUtf8" $
-    toLazyByteString (BP.cstringUtf8 "hello\xc0\x80world!"#) ==
-      LC.pack "hello" `L.append` L.singleton 0x00 `L.append` LC.pack "world!"
+testCStringUtf8 :: Int -> TestTree
+testCStringUtf8 sz = testProperty "cstringUtf8" $
+    BE.toLazyByteStringWith (BE.untrimmedStrategy sz sz) L.empty
+      (BP.cstringUtf8 "hello\xc0\x80\xc0\x80\xd0\xbc\xd0\xb8\xd1\x80\xc0\x80\xC0"#) ==
+      LC.pack "hello" `L.append` L.singleton 0x00
+                      `L.append` L.singleton 0x00
+                      `L.append` LC.pack "\xd0\xbc\xd0\xb8\xd1\x80"
+                      `L.append` L.singleton 0x00
+                      `L.append` L.singleton 0x00
 
 ------------------------------------------------------------------------------
 -- Binary