Skip to content

Commit 4bf746c

Browse files
EliotJones authored and BobLd committed
add new action to run integration against common crawl corpus
1 parent bffd514 commit 4bf746c

File tree

1 file changed

+58
-0
lines changed

1 file changed

+58
-0
lines changed
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
# Runs the PdfPig console runner against a slice of the Common Crawl PDF corpus.
name: Run Common Crawl Tests

on:
  # Run on every push to master.
  push:
    branches:
      - master
  # Allow the workflow to be started manually.
  workflow_dispatch:
  # Allow other workflows to invoke this one as a reusable workflow.
  workflow_call:
8+
9+
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      # Check out the repository so the console runner project is available.
      - uses: actions/checkout@v4

      - name: Set up dotnet core
        uses: actions/setup-dotnet@v4
        with:
          dotnet-version: "8.0.x"

      # Reuse a previously extracted corpus when one exists under this key.
      # The key encodes the archive numbers, so changing the archive set
      # below should be accompanied by a matching key change.
      - name: Restore corpus cache 0000, 0001
        id: restore-corpus
        uses: actions/cache@v4
        with:
          path: corpus/
          key: ${{ runner.os }}-pdf-corpus-0000-0001

      # Only hit the network when the cache step above did not find an
      # exact match for the key.
      - name: Download corpus if cache missed 0000, 0001
        if: steps.restore-corpus.outputs.cache-hit != 'true'
        run: |
          base_url="https://digitalcorpora.s3.amazonaws.com/corpora/files/CC-MAIN-2021-31-PDF-UNTRUNCATED/zipfiles/0000-0999"
          mkdir -p corpus/zipfiles
          cd corpus/zipfiles
          for archive in 0000 0001; do
            echo "Downloading ${archive}.zip"
            wget -nv "${base_url}/${archive}.zip" -O "${archive}.zip"
          done
          cd ..
          # The quoted glob is expanded by unzip itself, not by the shell,
          # so every downloaded archive is extracted into corpus/extracted.
          unzip 'zipfiles/*.zip' -d extracted
          # Drop the archives once extracted so they are not kept in the
          # cached corpus/ directory alongside the extracted files.
          rm -f zipfiles/*.zip

      # Delete specific corpus files before the run. The reason each file is
      # excluded is not recorded here. NOTE(review): consider documenting it.
      - name: Remove unwanted test files
        run: |
          skip_files=(
            "corpus/extracted/0000399.pdf"
            "corpus/extracted/0000819.pdf"
            "corpus/extracted/0000920.pdf"
            "corpus/extracted/0000300.pdf"
            "corpus/extracted/0001589.pdf"
            "corpus/extracted/0001957.pdf"
          )

          for file in "${skip_files[@]}"; do
            # Tolerate files that are already absent.
            if [ -f "$file" ]; then
              echo "Removing $file"
              rm "$file"
            fi
          done

      # Point the console runner at the extracted corpus directory.
      - name: Run tests against corpus
        run: dotnet run --project tools/UglyToad.PdfPig.ConsoleRunner/UglyToad.PdfPig.ConsoleRunner.csproj "corpus/extracted"

0 commit comments

Comments
 (0)