Skip to content

Commit c60f382

Browse files
committed
Update documentation
* merge tables and papers dirs into one
* add cells gold tags legend
1 parent f26bdfc commit c60f382

File tree

5 files changed

+381
-84
lines changed

5 files changed

+381
-84
lines changed

Makefile

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,15 @@ ARCHIVES_DIR = $(ARXIV_DIR)/sources
55
UNPACKED_DIR = $(ARXIV_DIR)/unpacked_sources
66
HTMLS_DIR = $(ARXIV_DIR)/htmls
77
FIXED_HTMLS_DIR = $(ARXIV_DIR)/htmls-clean
8-
TABLES_DIR = $(ARXIV_DIR)/tables
9-
TEXTS_DIR = $(ARXIV_DIR)/texts
8+
TABLES_DIR = $(ARXIV_DIR)/papers
9+
TEXTS_DIR = $(ARXIV_DIR)/papers
1010

1111
ARCHIVES := $(shell find $(ARCHIVES_DIR) -name '*.gz' -type f 2>/dev/null)
1212
UNPACKS := $(patsubst $(ARCHIVES_DIR)/%.gz,$(UNPACKED_DIR)/%,$(ARCHIVES))
1313
HTMLS := $(patsubst $(ARCHIVES_DIR)/%.gz,$(HTMLS_DIR)/%.html,$(ARCHIVES))
1414
FIXED_HTMLS := $(patsubst $(ARCHIVES_DIR)/%.gz,$(FIXED_HTMLS_DIR)/%.html,$(ARCHIVES))
1515
TABLES := $(patsubst $(ARCHIVES_DIR)/%.gz,$(TABLES_DIR)/%,$(ARCHIVES))
16-
TEXTS := $(patsubst $(ARCHIVES_DIR)/%.gz,$(TEXTS_DIR)/%.json,$(ARCHIVES))
16+
TEXTS := $(patsubst $(ARCHIVES_DIR)/%.gz,$(TEXTS_DIR)/%/text.json,$(ARCHIVES))
1717

1818
.PHONY: all
1919
all: $(ANNOTATIONS_DIR)/pdfs-urls.csv $(ANNOTATIONS_DIR)/sources-urls.csv extract_all
@@ -34,7 +34,7 @@ extract_all: extract_tables extract_texts
3434

3535
extract_texts: $(TEXTS)
3636

37-
$(TEXTS): $(TEXTS_DIR)/%.json: $(FIXED_HTMLS_DIR)/%.html
37+
$(TEXTS): $(TEXTS_DIR)/%/text.json: $(FIXED_HTMLS_DIR)/%.html
3838
python ./extract_texts.py $^ $@
3939

4040

environment.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,4 @@ dependencies:
1515
- Unidecode=1.0.23
1616
- elasticsearch-dsl=7.0.0
1717
- ipython=7.5.0
18-
- tqdm=4.28.1
1918
- joblib=0.13.2
20-
- fastprogress=0.1.20

0 commit comments

Comments
 (0)