Skip to content

Commit 4852ff6

Browse files
committed
Add text extraction to Makefile
1 parent 429ddf1 commit 4852ff6

File tree

8 files changed

+59
-12
lines changed

8 files changed

+59
-12
lines changed

Makefile

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,16 @@ UNPACKED_DIR = $(ARXIV_DIR)/unpacked_sources
66
HTMLS_DIR = $(ARXIV_DIR)/htmls
77
FIXED_HTMLS_DIR = $(ARXIV_DIR)/htmls-clean
88
TABLES_DIR = $(ARXIV_DIR)/tables
9+
TEXTS_DIR = $(ARXIV_DIR)/texts
910

1011
ARCHIVES = $(wildcard $(ARCHIVES_DIR)/**.gz)
1112
UNPACKS = $(patsubst $(ARCHIVES_DIR)/%.gz,$(UNPACKED_DIR)/%,$(ARCHIVES))
1213
HTMLS = $(patsubst $(ARCHIVES_DIR)/%.gz,$(HTMLS_DIR)/%.html,$(ARCHIVES))
1314
FIXED_HTMLS = $(patsubst $(ARCHIVES_DIR)/%.gz,$(FIXED_HTMLS_DIR)/%.html,$(ARCHIVES))
1415
TABLES = $(patsubst $(ARCHIVES_DIR)/%.gz,$(TABLES_DIR)/%,$(ARCHIVES))
16+
TEXTS = $(patsubst $(ARCHIVES_DIR)/%.gz,$(TEXTS_DIR)/%.json,$(ARCHIVES))
1517

16-
$(shell mkdir -p "$(DATA_DIR)" "$(ANNOTATIONS_DIR)" "$(UNPACKED_DIR)" "$(HTMLS_DIR)" "$(FIXED_HTMLS_DIR)" "$(TABLES_DIR)")
18+
$(shell mkdir -p "$(DATA_DIR)" "$(ANNOTATIONS_DIR)" "$(UNPACKED_DIR)" "$(HTMLS_DIR)" "$(FIXED_HTMLS_DIR)" "$(TABLES_DIR)" "$(TEXTS_DIR)")
1719

1820
.PHONY: all
1921
all: $(ANNOTATIONS_DIR)/pdfs-urls.csv $(ANNOTATIONS_DIR)/sources-urls.csv extract_all
@@ -27,7 +29,17 @@ test:
2729
cat $(TABLES_DIR)/paper/table_01.csv
2830
diff $(TABLES_DIR)/paper/table_01.csv test/src/table_01.csv
2931

30-
extract_all: $(TABLES)
# Entry-point targets — not files, so they must be declared phony.
.PHONY: extract_all extract_texts extract_tables fix_htmls_all convert_all unpack_all

# Full extraction pipeline: tables first, then plain text.
extract_all: extract_tables extract_texts

extract_texts: $(TEXTS)

# Static pattern rule: one JSON text file per cleaned HTML paper.
$(TEXTS): $(TEXTS_DIR)/%.json: $(FIXED_HTMLS_DIR)/%.html
	python ./extract_texts.py $< $@

extract_tables: $(TABLES)
3143

3244
fix_htmls_all: $(FIXED_HTMLS)
3345

environment.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,5 @@ dependencies:
1313
- python=3.7.1
1414
- pyahocorasick=1.4.0
1515
- Unidecode=1.0.23
16+
- elasticsearch-dsl=7.0.0
17+
- ipython=7.5.0

extract_texts.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
import fire
from sota_extractor2.data.elastic import Paper
from pathlib import Path


def extract_text(source, target):
    """Extract a paper's text from a cleaned HTML file into a JSON file.

    Parses ``source`` (an HTML file produced by the fix-htmls step) into a
    Paper document and writes its JSON serialization to ``target``, creating
    the target's parent directories as needed.

    source -- path to the input .html file
    target -- path of the .json file to write
    """
    source = Path(source)
    target = Path(target)
    target.parent.mkdir(exist_ok=True, parents=True)

    # NOTE(review): the original computed `arxiv_id = source.stem` but never
    # used it — removed as dead code.
    doc = Paper.parse_paper(source)
    with open(target, 'wt') as f:
        f.write(doc.to_json())


if __name__ == "__main__":
    fire.Fire(extract_text)

latex2html.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ cp -r "$RO_SOURCE_DIR" "$SOURCE_DIR"
99
cd "$SOURCE_DIR"
1010
MAINTEX=$(find . -type f -iname "*.tex" -print0 | xargs -0 grep -l documentclass | head -1)
1111
echo $MAINTEX
12-
timeout -s KILL 60 htlatex "$MAINTEX" '' '' '' '-interaction=nonstopmode'
12+
timeout -s KILL 60 htlatex "$MAINTEX" '' '' '' '-interaction=batchmode'
1313

1414
FILENAME=$(basename $MAINTEX)
1515
FILENAME="${FILENAME%.tex}.html"

normalize_references.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,9 @@ def resolve_references_in_html(args):
6666
update_references(html, mapping)
6767
save_html(output, html)
6868

69-
DUMP_REFERENCES_PATH = Path("/home/ubuntu/pwc/mycache/references-short.json")
69+
#DUMP_REFERENCES_PATH = Path("/home/ubuntu/pwc/mycache/references-short.json")
7070

71-
TRIE_PATH = Path("/home/ubuntu/pwc/mycache/automaton.pkl")
71+
#TRIE_PATH = Path("/home/ubuntu/pwc/mycache/automaton.pkl")
7272

7373
def normalize_references(source_path, target_path, automaton, jobs=1):
7474
global reference_trie

sota_extractor2/data/elastic.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from elasticsearch_dsl import Document, Boolean, Object, \
66
analyzer, InnerDoc, Keyword, Text, Integer, tokenizer, token_filter
7+
from elasticsearch_dsl.serializer import serializer
78

89
from IPython.display import display, Markdown
910

@@ -123,6 +124,9 @@ class Paper(Document):
123124
class Index:
124125
name = 'papers'
125126

127+
def to_json(self):
128+
return serializer.dumps(self.to_dict())
129+
126130
def to_df(self):
127131
return pd.DataFrame({'header': [f.header for f in self.fragments],
128132
'text': [f.text for f in self.fragments],

test/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
data/

test/src/main.tex

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,24 @@
11
\documentclass{article}
22
\usepackage{booktabs}
3+
\title{DILBERT: Distilling Inner Latent BERT variables}
4+
\author{John Doe}
35
\begin{document}
4-
\begin{tabular}{lcr} \toprule
5-
left & center & right\\\midrule
6-
1 & 2 & 3\\
7-
4 & 5 & 6\\\midrule
8-
7 & 8 & 9\\
9-
a & b & c\\\bottomrule
10-
\end{tabular}
6+
\maketitle
7+
\begin{abstract}
8+
In this paper we achieve state-of-the-art performance in random number generation.
9+
\end{abstract}
10+
\section{Introduction}
11+
\section{Experiments}
12+
In this section we present Table~\ref{tab}.
13+
\begin{table}
14+
\begin{tabular}{lcr} \toprule
15+
left & center & right\\\midrule
16+
1 & 2 & 3\\
17+
4 & 5 & 6\\\midrule
18+
7 & 8 & 9\\
19+
a & b & c\\\bottomrule
20+
\end{tabular}
21+
\caption{A table.}
22+
\label{tab}
23+
\end{table}
1124
\end{document}

0 commit comments

Comments
 (0)