77 using Tokenization . Scanner ;
88 using Tokens ;
99
10- internal class XrefOffsetValidator
10+ internal sealed class XrefOffsetValidator
1111 {
12- private static readonly long MinimumSearchOffset = 6 ;
12+ private const long MinimumSearchOffset = 6 ;
13+
14+ private static ReadOnlySpan < byte > XRefBytes => "xref"u8 ;
15+ private static ReadOnlySpan < byte > SpaceObjBytes => " obj"u8 ;
1316
1417 private readonly ILog log ;
1518
19+ private List < long > ? bfSearchStartXRefTablesOffsets ;
1620 private List < long > ? bfSearchXRefTablesOffsets ;
1721 private List < long > ? bfSearchXRefStreamsOffsets ;
1822
@@ -90,16 +94,18 @@ private long BruteForceSearchForXref(long xrefOffset, ISeekableTokenScanner scan
9094
9195 BfSearchForXRefStreams ( reader ) ;
9296
93- if ( bfSearchXRefTablesOffsets != null )
97+ if ( bfSearchXRefTablesOffsets != null && bfSearchXRefTablesOffsets . Count > 0 )
9498 {
9599 // TODO to be optimized, this won't work in every case
96100 newOffsetTable = SearchNearestValue ( bfSearchXRefTablesOffsets , xrefOffset ) ;
97101 }
98- if ( bfSearchXRefStreamsOffsets != null )
102+
103+ if ( bfSearchXRefStreamsOffsets != null && bfSearchXRefStreamsOffsets . Count > 0 )
99104 {
100105 // TODO to be optimized, this won't work in every case
101106 newOffsetStream = SearchNearestValue ( bfSearchXRefStreamsOffsets , xrefOffset ) ;
102107 }
108+
103109 // choose the nearest value
104110 if ( newOffsetTable > - 1 && newOffsetStream > - 1 )
105111 {
@@ -126,9 +132,91 @@ private long BruteForceSearchForXref(long xrefOffset, ISeekableTokenScanner scan
126132 newOffset = newOffsetStream ;
127133 bfSearchXRefStreamsOffsets ! . Remove ( newOffsetStream ) ;
128134 }
135+ else
136+ {
137+ log . Warn ( "Trying to repair xref offset by looking for all startxref." ) ;
138+ if ( TryBruteForceSearchForXrefFromStartxref ( xrefOffset , scanner , reader , out long newOffsetFromStartxref ) )
139+ {
140+ newOffset = newOffsetFromStartxref ;
141+ }
142+ }
143+
129144 return newOffset ;
130145 }
131146
/// <summary>
/// Fallback repair step: picks the <c>startxref</c> keyword closest to <paramref name="xrefOffset"/>,
/// reads the numeric token that follows it and accepts that number as the repaired offset when it
/// either points at an <c>xref</c> operator or passes <c>CheckXRefStreamOffset</c>.
/// </summary>
/// <param name="xrefOffset">The offset being repaired; used to choose the nearest <c>startxref</c> candidate.</param>
/// <param name="scanner">Token scanner used to read and probe the candidate offset. Its position is restored before returning.</param>
/// <param name="reader">Raw input that is searched for <c>startxref</c> keywords.</param>
/// <param name="newOffset">The repaired offset when the method returns <c>true</c>; otherwise <c>-1</c>.</param>
/// <returns><c>true</c> when a usable offset was found; otherwise <c>false</c>.</returns>
private bool TryBruteForceSearchForXrefFromStartxref(long xrefOffset, ISeekableTokenScanner scanner, IInputBytes reader, out long newOffset)
{
    // Number of bytes skipped to get from the recorded keyword offset to the value that follows it
    // (the length of the "startxref" keyword).
    const int StartXRefKeywordLength = 9;

    newOffset = -1;
    BruteForceSearchForStartxref(reader);

    // Mirror the guards used for the table/stream offset lists: never ask for the
    // nearest value of a list that is missing or empty.
    var candidates = bfSearchStartXRefTablesOffsets;
    if (candidates is null || candidates.Count == 0)
    {
        return false;
    }

    long newStartXRefOffset = SearchNearestValue(candidates, xrefOffset);

    // A negative result cannot be a real position (assumed to be the "not found" sentinel),
    // and anything at or beyond the end of the input cannot be followed by an offset value.
    if (newStartXRefOffset < 0 || newStartXRefOffset >= reader.Length)
    {
        return false;
    }

    var startOffset = scanner.CurrentPosition;
    scanner.Seek(newStartXRefOffset + StartXRefKeywordLength);

    if (scanner.MoveNext() && scanner.CurrentToken is NumericToken token && token.Long > -1)
    {
        long candidateOffset = token.Long;

        scanner.Seek(candidateOffset);

        // Only trust the current token if the scanner actually produced one at the candidate offset.
        bool pointsAtXrefTable = scanner.MoveNext() && ReferenceEquals(scanner.CurrentToken, OperatorToken.Xref);

        // The stream check is only needed when the offset is not already confirmed as a table.
        if (pointsAtXrefTable || CheckXRefStreamOffset(candidateOffset, scanner, true))
        {
            newOffset = candidateOffset;
        }
    }

    // Leave the scanner where the caller had it.
    scanner.Seek(startOffset);

    return newOffset != -1;
}
183+
/// <summary>
/// Scans the input for every occurrence of <c>FileTrailerParser.StartXRefBytes</c> and stores the
/// offsets found in <c>bfSearchStartXRefTablesOffsets</c>. The scan runs at most once; later calls
/// return immediately because the list has already been created.
/// </summary>
/// <param name="bytes">The input to scan. Its offset is moved back to where it was on entry before returning.</param>
private void BruteForceSearchForStartxref(IInputBytes bytes)
{
    // Distance from a recorded hit to the point where scanning resumes
    // (presumably the length of the "startxref" keyword).
    const int ResumeDistance = 9;

    if (bfSearchStartXRefTablesOffsets is not null)
    {
        return;
    }

    // A PDF can carry several startxref entries, so every hit is collected.
    var hits = new List<long>();
    bfSearchStartXRefTablesOffsets = hits;

    var offsetOnEntry = bytes.CurrentOffset;

    bytes.Seek(MinimumSearchOffset);

    while (bytes.MoveNext() && !bytes.IsAtEnd())
    {
        if (!ReadHelper.IsString(bytes, FileTrailerParser.StartXRefBytes))
        {
            continue;
        }

        var hitOffset = bytes.CurrentOffset;

        // Step back one position and keep the hit only when the byte there is whitespace.
        bytes.Seek(hitOffset - 1);
        bool whitespaceBefore = ReadHelper.IsWhitespace(bytes.CurrentByte);
        if (whitespaceBefore)
        {
            hits.Add(hitOffset);
        }

        // Resume scanning after the match.
        bytes.Seek(hitOffset + ResumeDistance);
    }

    bytes.Seek(offsetOnEntry);
}
219+
132220 private void BruteForceSearchForTables ( IInputBytes bytes )
133221 {
134222 if ( bfSearchXRefTablesOffsets != null )
@@ -146,7 +234,7 @@ private void BruteForceSearchForTables(IInputBytes bytes)
146234 // search for xref tables
147235 while ( bytes . MoveNext ( ) && ! bytes . IsAtEnd ( ) )
148236 {
149- if ( ReadHelper . IsString ( bytes , "xref" ) )
237+ if ( ReadHelper . IsString ( bytes , XRefBytes ) )
150238 {
151239 var newOffset = bytes . CurrentOffset ;
152240
@@ -180,11 +268,9 @@ private void BfSearchForXRefStreams(IInputBytes bytes)
180268 bytes . Seek ( MinimumSearchOffset ) ;
181269
182270 // search for XRef streams
183- var objString = " obj" ;
184-
185271 while ( bytes . MoveNext ( ) && ! bytes . IsAtEnd ( ) )
186272 {
187- if ( ! ReadHelper . IsString ( bytes , "xref" ) )
273+ if ( ! ReadHelper . IsString ( bytes , XRefBytes ) )
188274 {
189275 continue ;
190276 }
@@ -209,7 +295,7 @@ private void BfSearchForXRefStreams(IInputBytes bytes)
209295
210296 for ( int j = 0 ; j < 10 ; j ++ )
211297 {
212- if ( ReadHelper . IsString ( bytes , objString ) )
298+ if ( ReadHelper . IsString ( bytes , SpaceObjBytes ) )
213299 {
214300 long tempOffset = currentOffset - 1 ;
215301
@@ -224,7 +310,7 @@ private void BfSearchForXRefStreams(IInputBytes bytes)
224310 bytes . Seek ( tempOffset ) ;
225311
226312 // is the digit preceded by a space?
227- if ( ReadHelper . IsSpace ( bytes . CurrentByte ) )
313+ if ( ReadHelper . IsWhitespace ( bytes . CurrentByte ) )
228314 {
229315 int length = 0 ;
230316 bytes . Seek ( -- tempOffset ) ;
0 commit comments