Skip to content

Commit 4852ff6

Browse files
committed
Add text extraction to Makefile
1 parent 429ddf1 commit 4852ff6

File tree

8 files changed

+59
-12
lines changed

8 files changed

+59
-12
lines changed

Makefile

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,16 @@ UNPACKED_DIR = $(ARXIV_DIR)/unpacked_sources
66
HTMLS_DIR = $(ARXIV_DIR)/htmls
77
FIXED_HTMLS_DIR = $(ARXIV_DIR)/htmls-clean
88
TABLES_DIR = $(ARXIV_DIR)/tables
9+
TEXTS_DIR = $(ARXIV_DIR)/texts
910

1011
ARCHIVES = $(wildcard $(ARCHIVES_DIR)/**.gz)
1112
UNPACKS = $(patsubst $(ARCHIVES_DIR)/%.gz,$(UNPACKED_DIR)/%,$(ARCHIVES))
1213
HTMLS = $(patsubst $(ARCHIVES_DIR)/%.gz,$(HTMLS_DIR)/%.html,$(ARCHIVES))
1314
FIXED_HTMLS = $(patsubst $(ARCHIVES_DIR)/%.gz,$(FIXED_HTMLS_DIR)/%.html,$(ARCHIVES))
1415
TABLES = $(patsubst $(ARCHIVES_DIR)/%.gz,$(TABLES_DIR)/%,$(ARCHIVES))
16+
TEXTS = $(patsubst $(ARCHIVES_DIR)/%.gz,$(TEXTS_DIR)/%.json,$(ARCHIVES))
1517

16-
$(shell mkdir -p "$(DATA_DIR)" "$(ANNOTATIONS_DIR)" "$(UNPACKED_DIR)" "$(HTMLS_DIR)" "$(FIXED_HTMLS_DIR)" "$(TABLES_DIR)")
18+
$(shell mkdir -p "$(DATA_DIR)" "$(ANNOTATIONS_DIR)" "$(UNPACKED_DIR)" "$(HTMLS_DIR)" "$(FIXED_HTMLS_DIR)" "$(TABLES_DIR)" "$(TEXTS_DIR)")
1719

1820
.PHONY: all
1921
all: $(ANNOTATIONS_DIR)/pdfs-urls.csv $(ANNOTATIONS_DIR)/sources-urls.csv extract_all
@@ -27,7 +29,17 @@ test:
2729
cat $(TABLES_DIR)/paper/table_01.csv
2830
diff $(TABLES_DIR)/paper/table_01.csv test/src/table_01.csv
2931

30-
extract_all: $(TABLES)
# Entry-point targets — not files, so they must be declared phony.
.PHONY: extract_all extract_texts extract_tables fix_htmls_all convert_all unpack_all

# Full extraction pipeline: tables first, then plain text.
extract_all: extract_tables extract_texts

extract_texts: $(TEXTS)

# Static pattern rule: one JSON text file per cleaned HTML paper.
$(TEXTS): $(TEXTS_DIR)/%.json: $(FIXED_HTMLS_DIR)/%.html
	python ./extract_texts.py $< $@

extract_tables: $(TABLES)
3143

3244
fix_htmls_all: $(FIXED_HTMLS)
3345

environment.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,5 @@ dependencies:
1313
- python=3.7.1
1414
- pyahocorasick=1.4.0
1515
- Unidecode=1.0.23
16+
- elasticsearch-dsl=7.0.0
17+
- ipython=7.5.0

extract_texts.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
import fire
from sota_extractor2.data.elastic import Paper
from pathlib import Path


def extract_text(source, target):
    """Extract a paper's text from a cleaned HTML file into a JSON file.

    Parses ``source`` (an HTML file produced by the fix-htmls step) into a
    Paper document and writes its JSON serialization to ``target``, creating
    the target's parent directories as needed.

    source -- path to the input .html file
    target -- path of the .json file to write
    """
    source = Path(source)
    target = Path(target)
    target.parent.mkdir(exist_ok=True, parents=True)

    # NOTE(review): the original computed `arxiv_id = source.stem` but never
    # used it — removed as dead code.
    doc = Paper.parse_paper(source)
    with open(target, 'wt') as f:
        f.write(doc.to_json())


if __name__ == "__main__":
    fire.Fire(extract_text)

latex2html.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ cp -r "$RO_SOURCE_DIR" "$SOURCE_DIR"
99
cd "$SOURCE_DIR"
1010
MAINTEX=$(find . -type f -iname "*.tex" -print0 | xargs -0 grep -l documentclass | head -1)
1111
echo $MAINTEX
12-
timeout -s KILL 60 htlatex "$MAINTEX" '' '' '' '-interaction=nonstopmode'
12+
timeout -s KILL 60 htlatex "$MAINTEX" '' '' '' '-interaction=batchmode'
1313

1414
FILENAME=$(basename $MAINTEX)
1515
FILENAME="${FILENAME%.tex}.html"

normalize_references.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,9 @@ def resolve_references_in_html(args):
6666
update_references(html, mapping)
6767
save_html(output, html)
6868

69-
DUMP_REFERENCES_PATH = Path("/home/ubuntu/pwc/mycache/references-short.json")
69+
#DUMP_REFERENCES_PATH = Path("/home/ubuntu/pwc/mycache/references-short.json")
7070

71-
TRIE_PATH = Path("/home/ubuntu/pwc/mycache/automaton.pkl")
71+
#TRIE_PATH = Path("/home/ubuntu/pwc/mycache/automaton.pkl")
7272

7373
def normalize_references(source_path, target_path, automaton, jobs=1):
7474
global reference_trie

sota_extractor2/data/elastic.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from elasticsearch_dsl import Document, Boolean, Object, \
66
analyzer, InnerDoc, Keyword, Text, Integer, tokenizer, token_filter
7+
from elasticsearch_dsl.serializer import serializer
78

89
from IPython.display import display, Markdown
910

@@ -123,6 +124,9 @@ class Paper(Document):
123124
class Index:
124125
name = 'papers'
125126

127+
def to_json(self):
128+
return serializer.dumps(self.to_dict())
129+
126130
def to_df(self):
127131
return pd.DataFrame({'header': [f.header for f in self.fragments],
128132
'text': [f.text for f in self.fragments],

test/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
data/

test/src/main.tex

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,24 @@
11
\documentclass{article}
22
\usepackage{booktabs}
3+
\title{DILBERT: Distilling Inner Latent BERT variables}
4+
\author{John Doe}
35
\begin{document}
4-
\begin{tabular}{lcr} \toprule
5-
left & center & right\\\midrule
6-
1 & 2 & 3\\
7-
4 & 5 & 6\\\midrule
8-
7 & 8 & 9\\
9-
a & b & c\\\bottomrule
10-
\end{tabular}
6+
\maketitle
7+
\begin{abstract}
8+
In this paper we achieve state-of-the-art performance in random number generation.
9+
\end{abstract}
10+
\section{Introduction}
11+
\section{Experiments}
12+
In this section we present Table~\ref{tab}.
13+
\begin{table}
14+
\begin{tabular}{lcr} \toprule
15+
left & center & right\\\midrule
16+
1 & 2 & 3\\
17+
4 & 5 & 6\\\midrule
18+
7 & 8 & 9\\
19+
a & b & c\\\bottomrule
20+
\end{tabular}
21+
\caption{A table.}
22+
\label{tab}
23+
\end{table}
1124
\end{document}

0 commit comments

Comments
 (0)