|
1 | 1 | DATA_DIR = ../data
|
2 | 2 | ANNOTATIONS_DIR = $(DATA_DIR)/annotations
|
3 |
| -ARCHIVES_DIR = $(DATA_DIR)/arxiv/sources |
4 |
| -UNPACKED_DIR = $(DATA_DIR)/arxiv/unpacked_sources |
5 |
| -HTMLS_DIR = $(DATA_DIR)/arxiv/htmls |
6 |
| -TABLES_DIR = $(DATA_DIR)/arxiv/tables |
| 3 | +ARXIV_DIR = $(DATA_DIR)/arxiv |
| 4 | +ARCHIVES_DIR = $(ARXIV_DIR)/sources |
| 5 | +UNPACKED_DIR = $(ARXIV_DIR)/unpacked_sources |
| 6 | +HTMLS_DIR = $(ARXIV_DIR)/htmls |
| 7 | +FIXED_HTMLS_DIR = $(ARXIV_DIR)/htmls-clean |
| 8 | +TABLES_DIR = $(ARXIV_DIR)/tables |
7 | 9 |
|
8 |
| -ARCHIVES = $(wildcard $(ARCHIVES_DIR)/*) |
9 |
| -UNPACKS = $(patsubst $(ARCHIVES_DIR)/%,$(UNPACKED_DIR)/%,$(ARCHIVES)) |
10 |
| -HTMLS = $(patsubst $(ARCHIVES_DIR)/%,$(HTMLS_DIR)/%.html,$(ARCHIVES)) |
11 |
| -TABLES = $(patsubst $(ARCHIVES_DIR)/%,$(TABLES_DIR)/%,$(ARCHIVES)) |
| 10 | +ARCHIVES = $(wildcard $(ARCHIVES_DIR)/**/*.gz) |
| 11 | +UNPACKS = $(patsubst $(ARCHIVES_DIR)/%.gz,$(UNPACKED_DIR)/%,$(ARCHIVES)) |
| 12 | +HTMLS = $(patsubst $(ARCHIVES_DIR)/%.gz,$(HTMLS_DIR)/%.html,$(ARCHIVES)) |
| 13 | +FIXED_HTMLS = $(patsubst $(ARCHIVES_DIR)/%.gz,$(FIXED_HTMLS_DIR)/%.html,$(ARCHIVES)) |
| 14 | +TABLES = $(patsubst $(ARCHIVES_DIR)/%.gz,$(TABLES_DIR)/%,$(ARCHIVES)) |
12 | 15 |
|
13 |
| -$(shell mkdir -p "$(DATA_DIR)" "$(HTMLS_DIR)" "$(TABLES_DIR)") |
| 16 | +$(shell mkdir -p "$(DATA_DIR)" "$(UNPACKED_DIR)" "$(HTMLS_DIR)" "$(FIXED_HTMLS_DIR)" "$(TABLES_DIR)") |
14 | 17 |
|
15 | 18 | .PHONY: all
|
16 | 19 | all: $(ANNOTATIONS_DIR)/pdfs-urls.csv $(ANNOTATIONS_DIR)/sources-urls.csv extract_all
|
17 | 20 |
|
18 | 21 | extract_all: $(TABLES)
|
19 | 22 |
|
| 23 | +fix_htmls_all: $(FIXED_HTMLS) |
| 24 | + |
20 | 25 | convert_all: $(HTMLS)
|
21 | 26 |
|
22 |
| -$(TABLES): $(TABLES_DIR)/%: $(HTMLS_DIR)/%.html |
23 |
| - python ./extract_tables.py $^ --outdir $(TABLES_DIR) |
| 27 | +$(TABLES): $(TABLES_DIR)/%: $(FIXED_HTMLS_DIR)/%.html |
| 28 | + python ./extract_tables.py $^ --outdir $@ |
| 29 | + |
| 30 | +$(FIXED_HTMLS): $(FIXED_HTMLS_DIR)/%: $(HTMLS_DIR)/% |
| 31 | + ./clean_html.sh $^ $@ |
24 | 32 |
|
25 | 33 | $(HTMLS): $(HTMLS_DIR)/%.html: $(UNPACKED_DIR)/%
|
26 |
| - ./docker-latex2html.sh $(HTMLS_DIR) $^ |
| 34 | + ./docker-latex2html.sh $^ $@ |
27 | 35 |
|
28 | 36 | unpack_all: $(UNPACKS)
|
29 | 37 |
|
30 |
| -$(UNPACKS): $(UNPACKED_DIR)/%: $(ARCHIVES_DIR)/% |
| 38 | +$(UNPACKS): $(UNPACKED_DIR)/%: $(ARCHIVES_DIR)/%.gz |
31 | 39 | ./unpack-sources.sh $^ $@
|
32 | 40 |
|
33 | 41 | $(ANNOTATIONS_DIR)/pdfs-urls.csv: $(ANNOTATIONS_DIR)/papers-urls.csv
|
|
0 commit comments