# Workflow that runs the console runner against a Common Crawl PDF corpus.
name: Run Common Crawl Tests

# Triggers: pushes to master, manual dispatch, and calls from other workflows.
on:
  push:
    branches:
      - master
  workflow_dispatch:
  workflow_call:

jobs:
  # Single job: restore (or download) two corpus shards, drop a fixed list of
  # files, then run the ConsoleRunner project over the extracted directory.
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up dotnet core
        uses: actions/setup-dotnet@v4
        with:
          # Quoted so the version is always read as a string.
          dotnet-version: "8.0.x"

      # The cache key names the shard IDs; change it when the shard list in
      # the download step changes so a stale corpus is not restored.
      - name: Restore corpus cache 0000, 0001
        id: restore-corpus
        uses: actions/cache@v4
        with:
          path: corpus/
          key: ${{ runner.os }}-pdf-corpus-0000-0001

      # Only runs on a cache miss. Every line below is shell; the zip cleanup
      # must be a plain command (not a nested `run:` key) or the step fails.
      - name: Download corpus if cache missed 0000, 0001
        if: steps.restore-corpus.outputs.cache-hit != 'true'
        run: |
          base_url="https://digitalcorpora.s3.amazonaws.com/corpora/files/CC-MAIN-2021-31-PDF-UNTRUNCATED/zipfiles/0000-0999"
          mkdir -p corpus/zipfiles
          cd corpus/zipfiles
          for shard in 0000 0001; do
            echo "Downloading ${shard}.zip"
            wget -nv "${base_url}/${shard}.zip" -O "${shard}.zip"
          done
          cd ..
          # The glob is quoted so unzip itself expands it across all archives.
          unzip 'zipfiles/*.zip' -d extracted
          # Delete the archives so only the extracted files end up in the cache.
          rm -f zipfiles/*.zip

      # Files excluded from the run; the reason for each exclusion is not
      # recorded in this workflow.
      - name: Remove unwanted test files
        run: |
          skip_files=(
            "corpus/extracted/0000399.pdf"
            "corpus/extracted/0000819.pdf"
            "corpus/extracted/0000920.pdf"
            "corpus/extracted/0000300.pdf"
            "corpus/extracted/0001589.pdf"
            "corpus/extracted/0001957.pdf"
          )

          for file in "${skip_files[@]}"; do
            if [ -f "$file" ]; then
              echo "Removing $file"
              rm "$file"
            fi
          done

      - name: Run tests against corpus
        run: dotnet run --project tools/UglyToad.PdfPig.ConsoleRunner/UglyToad.PdfPig.ConsoleRunner.csproj "corpus/extracted"
0 commit comments