Skip to content

Commit 9095d16

Browse files
committed
Allow for subdirectories, add dom cleaning
1 parent 35d7961 commit 9095d16

File tree

5 files changed

+38
-20
lines changed

5 files changed

+38
-20
lines changed

Makefile

Lines changed: 21 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -1,33 +1,41 @@
11
DATA_DIR = ../data
22
ANNOTATIONS_DIR = $(DATA_DIR)/annotations
3-
ARCHIVES_DIR = $(DATA_DIR)/arxiv/sources
4-
UNPACKED_DIR = $(DATA_DIR)/arxiv/unpacked_sources
5-
HTMLS_DIR = $(DATA_DIR)/arxiv/htmls
6-
TABLES_DIR = $(DATA_DIR)/arxiv/tables
3+
ARXIV_DIR = $(DATA_DIR)/arxiv
4+
ARCHIVES_DIR = $(ARXIV_DIR)/sources
5+
UNPACKED_DIR = $(ARXIV_DIR)/unpacked_sources
6+
HTMLS_DIR = $(ARXIV_DIR)/htmls
7+
FIXED_HTMLS_DIR = $(ARXIV_DIR)/htmls-clean
8+
TABLES_DIR = $(ARXIV_DIR)/tables
79

8-
ARCHIVES = $(wildcard $(ARCHIVES_DIR)/*)
9-
UNPACKS = $(patsubst $(ARCHIVES_DIR)/%,$(UNPACKED_DIR)/%,$(ARCHIVES))
10-
HTMLS = $(patsubst $(ARCHIVES_DIR)/%,$(HTMLS_DIR)/%.html,$(ARCHIVES))
11-
TABLES = $(patsubst $(ARCHIVES_DIR)/%,$(TABLES_DIR)/%,$(ARCHIVES))
10+
ARCHIVES = $(wildcard $(ARCHIVES_DIR)/**/*.gz)
11+
UNPACKS = $(patsubst $(ARCHIVES_DIR)/%.gz,$(UNPACKED_DIR)/%,$(ARCHIVES))
12+
HTMLS = $(patsubst $(ARCHIVES_DIR)/%.gz,$(HTMLS_DIR)/%.html,$(ARCHIVES))
13+
FIXED_HTMLS = $(patsubst $(ARCHIVES_DIR)/%.gz,$(FIXED_HTMLS_DIR)/%.html,$(ARCHIVES))
14+
TABLES = $(patsubst $(ARCHIVES_DIR)/%.gz,$(TABLES_DIR)/%,$(ARCHIVES))
1215

13-
$(shell mkdir -p "$(DATA_DIR)" "$(HTMLS_DIR)" "$(TABLES_DIR)")
16+
$(shell mkdir -p "$(DATA_DIR)" "$(UNPACKED_DIR)" "$(HTMLS_DIR)" "$(FIXED_HTMLS_DIR)" "$(TABLES_DIR)")
1417

1518
.PHONY: all
1619
all: $(ANNOTATIONS_DIR)/pdfs-urls.csv $(ANNOTATIONS_DIR)/sources-urls.csv extract_all
1720

1821
extract_all: $(TABLES)
1922

23+
fix_htmls_all: $(FIXED_HTMLS)
24+
2025
convert_all: $(HTMLS)
2126

22-
$(TABLES): $(TABLES_DIR)/%: $(HTMLS_DIR)/%.html
23-
python ./extract_tables.py $^ --outdir $(TABLES_DIR)
27+
$(TABLES): $(TABLES_DIR)/%: $(FIXED_HTMLS_DIR)/%.html
28+
python ./extract_tables.py $^ --outdir $@
29+
30+
$(FIXED_HTMLS): $(FIXED_HTMLS_DIR)/%: $(HTMLS_DIR)/%
31+
./clean_html.sh $^ $@
2432

2533
$(HTMLS): $(HTMLS_DIR)/%.html: $(UNPACKED_DIR)/%
26-
./docker-latex2html.sh $(HTMLS_DIR) $^
34+
./docker-latex2html.sh $^ $@
2735

2836
unpack_all: $(UNPACKS)
2937

30-
$(UNPACKS): $(UNPACKED_DIR)/%: $(ARCHIVES_DIR)/%
38+
$(UNPACKS): $(UNPACKED_DIR)/%: $(ARCHIVES_DIR)/%.gz
3139
./unpack-sources.sh $^ $@
3240

3341
$(ANNOTATIONS_DIR)/pdfs-urls.csv: $(ANNOTATIONS_DIR)/papers-urls.csv

clean_html.sh

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,7 @@
1+
#!/usr/bin/env bash
2+
SOURCE=$(realpath "$1")
3+
[ ! -f "$SOURCE" ] && echo "File $SOURCE not found." >&2 && exit 1
4+
mkdir -p $(dirname "$2")
5+
OUTPUT=$(realpath "$2")
6+
7+
docker run --rm -v "$SOURCE":/files/index.html:ro --entrypoint '' zenika/alpine-chrome:73 chromium-browser --headless --disable-gpu --disable-software-rasterizer --no-sandbox --dump-dom /files/index.html > "$OUTPUT"

docker-latex2html.sh

Lines changed: 7 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,9 @@
11
#!/usr/bin/env bash
2-
OUTPUT_DIR=$(realpath "$1")
3-
ARCHIVE=$(realpath "$2")
4-
FILENAME=$(basename "$ARCHIVE")
2+
SOURCE_DIR=$(realpath "$1") #~/arxiv/unpacked/1701/1701.xyz
3+
[ ! -d "$SOURCE_DIR" ] && echo "Directory $SOURCE_DIR not found." >&2 && exit 1
4+
mkdir -p $(dirname "$2")
5+
OUTPUT=$(realpath "$2") #~/arxiv/htmls/1701/1701.xyz.html
6+
OUTPUT_DIR=$(dirname "$OUTPUT") #~/arxiv/htmls/1701
7+
FILENAME=$(basename "$OUTPUT") #1701.xyz.html
58

6-
docker run --rm --stop-timeout 60 -v $PWD/latex2html.sh:/files/latex2html.sh:ro -v "$ARCHIVE":/files/ro-source:ro -v "$OUTPUT_DIR":/files/htmls niccokunzmann/ci-latex /files/latex2html.sh "$FILENAME"
9+
docker run --rm -v $PWD/latex2html.sh:/files/latex2html.sh:ro -v "$SOURCE_DIR":/files/ro-source:ro -v "$OUTPUT_DIR":/files/htmls niccokunzmann/ci-latex /files/latex2html.sh "$FILENAME"

extract_tables.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -114,7 +114,7 @@ def set_ids_by_labels(soup):
114114
def extract_tables(filename, outdir):
115115
with open(filename, "rb") as f:
116116
html = f.read()
117-
outdir = Path(outdir) / Path(filename).stem
117+
outdir = Path(outdir)
118118
outdir.mkdir(parents=True, exist_ok=True)
119119
soup = BeautifulSoup(html, "lxml")
120120
flatten_tables(soup)

latex2html.sh

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -9,8 +9,8 @@ cp -r "$RO_SOURCE_DIR" "$SOURCE_DIR"
99
cd "$SOURCE_DIR"
1010
MAINTEX=$(find . -type f -iname "*.tex" -print0 | xargs -0 grep -l documentclass | head -1)
1111
echo $MAINTEX
12-
timeout -s KILL 30 htlatex "$MAINTEX" '' '' '' '-interaction=nonstopmode'
12+
timeout -s KILL 60 htlatex "$MAINTEX" '' '' '' '-interaction=nonstopmode'
1313

1414
FILENAME=$(basename $MAINTEX)
1515
FILENAME="${FILENAME%.tex}.html"
16-
cp "$SOURCE_DIR/$FILENAME" "$OUTPUT_DIR/$OUTNAME.html"
16+
cp "$SOURCE_DIR/$FILENAME" "$OUTPUT_DIR/$OUTNAME"

0 commit comments

Comments
 (0)