Skip to content

Commit a404724

Browse files
authored
Additional digital corpora testing (#1261)
* Add additional testing pairs 0010-0011 for integration; some of these files required the skip-missing-fonts flag to be set to true.
  - Propagate lenient parsing to dictionaries inside arrays; this handles a corrupt file (0012710) that is not in this test set.
* Add pair 0012-0013.
1 parent 34db05f commit a404724

File tree

6 files changed

+36
-7
lines changed

6 files changed

+36
-7
lines changed

.github/workflows/run_common_crawl_tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ jobs:
1212
strategy:
1313
fail-fast: false
1414
matrix:
15-
pair: ["0000-0001", "0002-0003", "0004-0005", "0006-0007", "0008-0009"]
15+
pair: ["0000-0001", "0002-0003", "0004-0005", "0006-0007", "0008-0009", "0010-0011", "0012-0013"]
1616

1717
steps:
1818
- uses: actions/checkout@v2

src/UglyToad.PdfPig.Tests/Tokenization/ArrayTokenizerTests.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
public class ArrayTokenizerTests
88
{
9-
private readonly ArrayTokenizer tokenizer = new ArrayTokenizer(true, new StackDepthGuard(256));
9+
private readonly ArrayTokenizer tokenizer = new ArrayTokenizer(true, new StackDepthGuard(256), false);
1010

1111
[Theory]
1212
[InlineData("]")]

src/UglyToad.PdfPig.Tokenization/ArrayTokenizer.cs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,15 @@ internal sealed class ArrayTokenizer : ITokenizer
99
{
1010
private readonly bool usePdfDocEncoding;
1111
private readonly StackDepthGuard stackDepthGuard;
12+
private readonly bool useLenientParsing;
1213

1314
public bool ReadsNextByte { get; } = false;
1415

15-
public ArrayTokenizer(bool usePdfDocEncoding, StackDepthGuard stackDepthGuard)
16+
public ArrayTokenizer(bool usePdfDocEncoding, StackDepthGuard stackDepthGuard, bool useLenientParsing)
1617
{
1718
this.usePdfDocEncoding = usePdfDocEncoding;
1819
this.stackDepthGuard = stackDepthGuard;
20+
this.useLenientParsing = useLenientParsing;
1921
}
2022

2123
public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
@@ -27,7 +29,7 @@ public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken tok
2729
return false;
2830
}
2931

30-
var scanner = new CoreTokenScanner(inputBytes, usePdfDocEncoding, stackDepthGuard, ScannerScope.Array);
32+
var scanner = new CoreTokenScanner(inputBytes, usePdfDocEncoding, stackDepthGuard, ScannerScope.Array, useLenientParsing: useLenientParsing);
3133

3234
var contents = new List<IToken>();
3335

src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ public CoreTokenScanner(
7070
this.usePdfDocEncoding = usePdfDocEncoding;
7171
this.stackDepthGuard = stackDepthGuard;
7272
this.stringTokenizer = new StringTokenizer(usePdfDocEncoding);
73-
this.arrayTokenizer = new ArrayTokenizer(usePdfDocEncoding, this.stackDepthGuard);
73+
this.arrayTokenizer = new ArrayTokenizer(usePdfDocEncoding, this.stackDepthGuard, useLenientParsing);
7474
this.dictionaryTokenizer = new DictionaryTokenizer(usePdfDocEncoding, this.stackDepthGuard, useLenientParsing: useLenientParsing);
7575
this.scope = scope;
7676
this.namedDictionaryRequiredKeys = namedDictionaryRequiredKeys;

tools/UglyToad.PdfPig.ConsoleRunner/Program.cs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,11 @@ public static int Main(string[] args)
185185
sw.Reset();
186186
sw.Start();
187187

188-
using (var pdfDocument = PdfDocument.Open(file))
188+
using (var pdfDocument = PdfDocument.Open(file, new ParsingOptions
189+
{
190+
UseLenientParsing = true,
191+
SkipMissingFonts = true,
192+
}))
189193
{
190194
sw.Stop();
191195

tools/common-crawl-ignore.txt

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,4 +46,27 @@
4646
0009309.pdf
4747
0009464.pdf
4848
0009706.pdf
49-
0009944.pdf
49+
0009944.pdf
50+
0010114.pdf
51+
0010117.pdf
52+
0010216.pdf
53+
0010472.pdf
54+
0010697.pdf
55+
0010902.pdf
56+
0010950.pdf
57+
0011041.pdf
58+
0011171.pdf
59+
0011398.pdf
60+
0011450.pdf
61+
0011758.pdf
62+
0011989.pdf
63+
0012117.pdf
64+
0012684.pdf
65+
0012730.pdf
66+
0013051.pdf
67+
0013178.pdf
68+
0013338.pdf
69+
0013425.pdf
70+
0013587.pdf
71+
0013721.pdf
72+
0013822.pdf

0 commit comments

Comments
 (0)