Optimize the FirstPassParser to just fetch a final chunk before doing things char-by-char backwards

rhuijben · rhuijben · commit 7c9019bda78d · 2025-10-16T16:46:16.000+02:00
diff --git a/src/UglyToad.PdfPig.Fonts/Type1/Parser/Type1Tokenizer.cs b/src/UglyToad.PdfPig.Fonts/Type1/Parser/Type1Tokenizer.cs
@@ -395,9 +395,10 @@ private string ReadComment()
         private Type1DataToken ReadCharString(int length)
         {
             // Skip preceding space.
-            bytes.MoveNext();
-            // TODO: may be wrong
-            // bytes.MoveNext();
+            if (bytes.Peek() is { } ws && ReadHelper.IsWhitespace(ws))
+            {
+                bytes.MoveNext();
+            }
 
             byte[] data = new byte[length];
             for (int i = 0; i < length; i++)
diff --git a/src/UglyToad.PdfPig/Parser/FileStructure/FirstPassParser.StartXref.cs b/src/UglyToad.PdfPig/Parser/FileStructure/FirstPassParser.StartXref.cs
@@ -10,13 +10,56 @@ internal static partial class FirstPassParser
 {
     private static ReadOnlySpan<byte> StartXRefBytes => "startxref"u8;
 
+    public const long EndOfFileBufferSize = 1024;
+
     public static StartXRefLocation GetFirstCrossReferenceOffset(
         IInputBytes bytes,
         ISeekableTokenScanner scanner,
         ILog log)
     {
+        // We used to read backward through the file, but this is quite expensive for streams that directly wrap OS files.
+        // Instead, as a cheap first attempt, we fetch the last EndOfFileBufferSize (1024) bytes of the file and search them in memory.
+        // This is significantly faster in practice when there is no in-process caching of the file involved.
+        //
+        // If that fails (in practice it never should), we fall back to the old method of reading backwards.
         var fileLength = bytes.Length;
+        {
+            var fetchFrom = Math.Max(bytes.Length - EndOfFileBufferSize, 0L);
+
+            bytes.Seek(fetchFrom);
+
+            Span<byte> byteBuffer = new byte[bytes.Length - fetchFrom];   // TODO: Maybe use PoolArray?
+
+            int n = bytes.Read(byteBuffer);
+
+            if (n == byteBuffer.Length)
+            {
+                int lx = byteBuffer.LastIndexOf("startxref"u8);
+
+                if (lx < 0)
+                {
+                    // See old code. We also try a mangled version
+                    lx = byteBuffer.LastIndexOf("startref"u8);
+                }
+
+                if (lx >= 0)
+                {
+                    scanner.Seek(fetchFrom + lx);
+
+                    if (scanner.TryReadToken(out OperatorToken startXrefOp) && (startXrefOp.Data == "startxref" || startXrefOp.Data == "startref"))
+                    {
+                        var pos = GetNumericTokenFollowingCurrent(scanner);
+
+                        log.Debug($"Found startxref at {pos}");
+
+                        return new StartXRefLocation(fetchFrom + lx, pos);
+                    }
+                }
+
+            }
+        }
 
+        // Now fall through to the old code
         var buffer = new CircularByteBuffer(StartXRefBytes.Length);
 
         // Start from the end of the file
diff --git a/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs b/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs
@@ -57,7 +57,7 @@ public static IReadOnlyDictionary<IndirectReference, long> GetObjectLocations(II
                     {
                         var next = bytes.Peek();
 
-                        if (next.HasValue && next == 'n')
+                        if (next == 'n')
                         {
                             if (ReadHelper.IsString(bytes, "endobj"))
                             {

Original file line number	Diff line number	Diff line change
`@@ -57,7 +57,7 @@ public static IReadOnlyDictionary<IndirectReference, long> GetObjectLocations(II`
`57`	`57`	`{`
`58`	`58`	`var next = bytes.Peek();`
`59`	`59`
`60`		`- if (next.HasValue && next == 'n')`
	`60`	`+ if (next == 'n')`
`61`	`61`	`{`
`62`	`62`	`if (ReadHelper.IsString(bytes, "endobj"))`
`63`	`63`	`{`