PdfPig/src/UglyToad.PdfPig/Parser/FileStructure/FileTrailerParser.cs at 603f8c7b4c8f3a86282ba15a11c753db0d6520bf · BobLd/PdfPig · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
namespace UglyToad.PdfPig.Parser.FileStructure
{
    using System;
    using Core;
    using Tokenization.Scanner;
    using Tokens;

    /*
     * The trailer of a PDF file allows us to quickly find the cross-reference table and other special objects.
     * Readers should read a PDF file from its end.
     * The last line of the file should contain the end-of-file marker, %%EOF.
     * The two preceding lines should be the keyword startxref and the byte offset of the cross-reference section from the start of the document.
     * The startxref line might be preceded by the trailer dictionary of the form:
     * trailer
     * <</key1 value1/key2 value2/key3 value3/key4 value4>>
     * startxref
     * byte-offset
     * %%EOF
     */

    /// <summary>
    /// Finds the last <c>startxref</c> keyword near the end of a PDF file and reads the
    /// numeric byte offset that follows it.
    /// </summary>
    internal static class FileTrailerParser
    {
        /// <summary>
        /// Size in bytes of the first region searched backwards from the end of the file.
        /// The %%EOF may be further back in the file, so the region grows if nothing is found.
        /// </summary>
        private const int EndOfFileSearchRange = 2048;

        /// <summary>
        /// The raw bytes of the <c>startxref</c> keyword.
        /// </summary>
        internal static ReadOnlySpan<byte> StartXRefBytes => "startxref"u8;

        /// <summary>
        /// Reads the number which follows the last <c>startxref</c> keyword in the input.
        /// </summary>
        /// <param name="bytes">The file bytes, searched backwards from the end for the keyword.</param>
        /// <param name="scanner">The scanner used to read the keyword and the number following it.</param>
        /// <param name="isLenientParsing">Currently unused by this method; retained for caller compatibility.</param>
        /// <returns>The numeric value which follows <c>startxref</c>.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="bytes"/> or <paramref name="scanner"/> is null.</exception>
        /// <exception cref="InvalidOperationException">The scanner did not read <c>startxref</c> at the located position.</exception>
        /// <exception cref="PdfDocumentFormatException">
        /// No <c>startxref</c> was found, or it was not followed (ignoring comments) by a number.
        /// </exception>
        public static long GetFirstCrossReferenceOffset(IInputBytes bytes, ISeekableTokenScanner scanner, bool isLenientParsing)
        {
            if (bytes is null)
            {
                throw new ArgumentNullException(nameof(bytes));
            }

            if (scanner is null)
            {
                throw new ArgumentNullException(nameof(scanner));
            }

            var fileLength = bytes.Length;

            // Never ask for a first search window larger than the file itself.
            var offsetFromEnd = fileLength < EndOfFileSearchRange ? (int)fileLength : EndOfFileSearchRange;

            var startXrefPosition = GetStartXrefPosition(bytes, offsetFromEnd);

            scanner.Seek(startXrefPosition);

            if (!scanner.TryReadToken(out OperatorToken startXrefToken) || startXrefToken.Data != "startxref")
            {
                throw new InvalidOperationException($"The start xref position we found was not correct. Found {startXrefPosition} but it was occupied by token {scanner.CurrentToken}.");
            }

            // Only comments may sit between the keyword and its numeric offset.
            NumericToken? numeric = null;
            while (scanner.MoveNext())
            {
                if (scanner.CurrentToken is NumericToken token)
                {
                    numeric = token;
                    break;
                }

                if (scanner.CurrentToken is not CommentToken)
                {
                    throw new PdfDocumentFormatException($"Found an unexpected token following 'startxref': {scanner.CurrentToken}.");
                }
            }

            if (numeric is null)
            {
                throw new PdfDocumentFormatException($"Could not find the numeric value following 'startxref'. Searching from position {startXrefPosition}.");
            }

            return numeric.Long;
        }

        /// <summary>
        /// Searches backwards from the end of the input, in windows which double in size, for the
        /// last occurrence of <c>startxref</c> within the first window that contains one.
        /// </summary>
        /// <param name="bytes">The input to search.</param>
        /// <param name="chunkSize">The size in bytes of the first search window.</param>
        /// <returns>The position of the keyword's first byte ('s'), as derived from the input's current offset.</returns>
        /// <exception cref="PdfDocumentFormatException">The keyword does not occur anywhere in the input.</exception>
        private static long GetStartXrefPosition(IInputBytes bytes, int chunkSize)
        {
            var keyword = StartXRefBytes;

            // Track the window size as a long: doubling an int overflows for multi-gigabyte files,
            // which would stop the window from growing and prevent the loop from terminating.
            long searchSize = chunkSize;

            // Initialize startPos to the end to get the loop below started.
            var startPos = bytes.Length;

            do
            {
                // Make a sliding-window search region where each subsequent search will look further
                // back and not search in the already searched chunks. Make sure to search just beyond
                // the chunk to account for the possibility of startxref crossing chunk-boundaries.
                // The start-position is inclusive and the end-position is exclusive for the chunk.
                // Each search will look in an increasingly bigger chunk, doubling every time.
                var endPos = Math.Min(startPos + keyword.Length, bytes.Length);
                startPos = Math.Max(0L, endPos - searchSize);
                if (searchSize <= long.MaxValue / 2)
                {
                    searchSize *= 2;
                }

                // Prepare to search this region; mark startXrefPos as "not found".
                bytes.Seek(startPos);
                var startXrefPos = -1L;
                var matched = 0;

                // Start scanning the file bytes.
                while (bytes.CurrentOffset < endPos && bytes.MoveNext())
                {
                    var current = bytes.CurrentByte;

                    if (current == keyword[matched])
                    {
                        // We might be reading "startxref".
                        if (++matched == keyword.Length)
                        {
                            // Record this "startxref" (position from the start of the document to the first 's').
                            // Keep the full 64-bit offset; narrowing to int corrupts positions beyond 2 GiB.
                            startXrefPos = bytes.CurrentOffset - keyword.Length;

                            // Continue scanning to make sure we find the last startxref in case there are more
                            // than just one, which can be the case for incrementally updated PDFs with multiple
                            // generations of sections.
                            matched = 0;
                        }
                    }
                    else
                    {
                        // Not a continuation of the keyword. The mismatching byte may itself begin a new
                        // match (e.g. "sstartxref"); since 's' only occurs at the start of the keyword,
                        // restarting at either 1 or 0 is sufficient.
                        matched = current == keyword[0] ? 1 : 0;
                    }
                }

                // If we found a startxref then we're done.
                if (startXrefPos >= 0)
                {
                    return startXrefPos;
                }

            } while (startPos > 0); // Keep on searching until we've read from the very start.

            // No startxref position was found.
            throw new PdfDocumentFormatException("Could not find the startxref");
        }
    }
}