Extraction pipeline fixes

Marcin Kardas · Marcin Kardas · commit b40eefea7d7c · 2019-07-04T00:02:45.000+02:00
* force BeautifulSoup to use utf-8 for table extraction
* use inline table style
* use find to allow for nested directories with papers
diff --git a/Makefile b/Makefile
@@ -1,4 +1,4 @@
-DATA_DIR = ../data
+DATA_DIR = data
 ANNOTATIONS_DIR = $(DATA_DIR)/annotations
 ARXIV_DIR = $(DATA_DIR)/arxiv
 ARCHIVES_DIR = $(ARXIV_DIR)/sources
@@ -8,26 +8,25 @@ FIXED_HTMLS_DIR = $(ARXIV_DIR)/htmls-clean
 TABLES_DIR = $(ARXIV_DIR)/tables
 TEXTS_DIR = $(ARXIV_DIR)/texts
 
-ARCHIVES    = $(wildcard $(ARCHIVES_DIR)/**.gz)
-UNPACKS     = $(patsubst $(ARCHIVES_DIR)/%.gz,$(UNPACKED_DIR)/%,$(ARCHIVES))
-HTMLS       = $(patsubst $(ARCHIVES_DIR)/%.gz,$(HTMLS_DIR)/%.html,$(ARCHIVES))
-FIXED_HTMLS = $(patsubst $(ARCHIVES_DIR)/%.gz,$(FIXED_HTMLS_DIR)/%.html,$(ARCHIVES))
-TABLES      = $(patsubst $(ARCHIVES_DIR)/%.gz,$(TABLES_DIR)/%,$(ARCHIVES))
-TEXTS       = $(patsubst $(ARCHIVES_DIR)/%.gz,$(TEXTS_DIR)/%.json,$(ARCHIVES))
-
-$(shell mkdir -p "$(DATA_DIR)" "$(ANNOTATIONS_DIR)" "$(UNPACKED_DIR)" "$(HTMLS_DIR)" "$(FIXED_HTMLS_DIR)" "$(TABLES_DIR)" "$(TEXTS_DIR)")
+ARCHIVES    := $(shell find $(ARCHIVES_DIR) -name '*.gz' -type f 2>/dev/null)
+UNPACKS     := $(patsubst $(ARCHIVES_DIR)/%.gz,$(UNPACKED_DIR)/%,$(ARCHIVES))
+HTMLS       := $(patsubst $(ARCHIVES_DIR)/%.gz,$(HTMLS_DIR)/%.html,$(ARCHIVES))
+FIXED_HTMLS := $(patsubst $(ARCHIVES_DIR)/%.gz,$(FIXED_HTMLS_DIR)/%.html,$(ARCHIVES))
+TABLES      := $(patsubst $(ARCHIVES_DIR)/%.gz,$(TABLES_DIR)/%,$(ARCHIVES))
+TEXTS       := $(patsubst $(ARCHIVES_DIR)/%.gz,$(TEXTS_DIR)/%.json,$(ARCHIVES))
 
 .PHONY: all
 all:	$(ANNOTATIONS_DIR)/pdfs-urls.csv $(ANNOTATIONS_DIR)/sources-urls.csv extract_all
 
 .PHONY: test
 test: DATA_DIR = test/data
+test: TABLE_FILE = $(TABLES_DIR)/paper/table_01.csv
 test:
 	mkdir -p $(ARCHIVES_DIR)
 	tar czf $(ARCHIVES_DIR)/paper.gz -C test/src .
-	$(MAKE) DATA_DIR=$(DATA_DIR) extract_all
-	cat $(TABLES_DIR)/paper/table_01.csv
-	diff $(TABLES_DIR)/paper/table_01.csv test/src/table_01.csv
+	$(MAKE) DATA_DIR=$(DATA_DIR) --always-make extract_all
+	cat $(TABLE_FILE)
+	diff $(TABLE_FILE) test/src/table_01.csv
 
 .PHONY: extract_all extract_texts extract_tables fix_htmls_all convert_all unpack_all
 
@@ -72,6 +71,8 @@ $(ANNOTATIONS_DIR)/%:	$(ANNOTATIONS_DIR)/%.gz
 	gunzip -kf $^
 
+# Download the evaluation tables archive; create the target's directory as a regular recipe step (not via $(shell ...)) so it is echoed and skipped under `make -n`.
 $(ANNOTATIONS_DIR)/evaluation-tables.json.gz:
+	mkdir -p "$(@D)"
 	wget https://paperswithcode.com/media/about/evaluation-tables.json.gz -O $@
 
 
diff --git a/README.md b/README.md
@@ -26,7 +26,7 @@ conda env create -f environment.yml
 source activate xtables
 make -j 8 -i extract_all > stdout.log 2> stderr.log
 ```
-where `8` is number of jobs to run simultaneously.
+where `8` is the number of jobs to run simultaneously. Optionally, one can specify the path to the data directory, e.g., `make DATA_DIR=mydata ...`.
 
 ## Test
 To test the whole extraction on a single file run
diff --git a/extract_tables.py b/extract_tables.py
@@ -116,7 +116,7 @@ def extract_tables(filename, outdir):
         html = f.read()
     outdir = Path(outdir)
     outdir.mkdir(parents=True, exist_ok=True)
-    soup = BeautifulSoup(html, "lxml")
+    soup = BeautifulSoup(html, "lxml", from_encoding="utf-8")
     flatten_tables(soup)
     set_ids_by_labels(soup)
     tables = soup.select("div.tabular")
diff --git a/sota_extractor2/data/elastic.py b/sota_extractor2/data/elastic.py
@@ -1,6 +1,5 @@
 import pandas as pd
 import re
-from bs4 import BeautifulSoup
 
 from elasticsearch_dsl import Document, Boolean, Object, \
     analyzer, InnerDoc, Keyword, Text, Integer, tokenizer, token_filter
@@ -141,7 +140,7 @@ class Index:
 
     def to_json(self):
         data = self.to_dict()
-        return serializer.dumps(d)
+        return serializer.dumps(data)
 
     @classmethod
     def from_json(cls, json, paper_id=None):
diff --git a/sota_extractor2/data/paper_collection.py b/sota_extractor2/data/paper_collection.py
@@ -39,7 +39,6 @@ def __init__(self, path, load_texts=True, load_tables=True):
         outer_join = set(texts).union(set(tables))
 
         self._papers = {k: Paper(texts.get(k), tables.get(k), annotations.get(k)) for k in outer_join}
-        self.annotations = annotations
 
     def __len__(self):
         return len(self._papers)
diff --git a/sota_extractor2/helpers/jupyter.py b/sota_extractor2/helpers/jupyter.py
@@ -1,5 +1,5 @@
 from IPython.core.display import display, HTML
-
+from .table_style import table_style
 def set_seed(seed, name):
     import torch
     import numpy as np
@@ -11,6 +11,8 @@ def set_seed(seed, name):
 
 def display_html(s): return display(HTML(s))
 
+
+
 def display_table(table, structure=None):
     """
         matrix - 2d ndarray with cell values
@@ -22,7 +24,7 @@ def display_table(table, structure=None):
         matrix = table
     if structure is None: structure = table.matrix_gold_tags
     html = []
-    html.append('<link href="http://10.0.1.145:8001/static/css/main.bd3d2d63.chunk.css" rel="stylesheet">')
+    html.append(table_style)
     html.append('<div class="tableWrapper">')
     html.append("<table>")
     for row,struc_row in zip(matrix, structure):
diff --git a/sota_extractor2/helpers/table_style.py b/sota_extractor2/helpers/table_style.py
@@ -0,0 +1,3 @@
+table_style="""<style>body{margin:0;padding:0;font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Roboto,Oxygen,Ubuntu,Cantarell,Fira Sans,Droid Sans,Helvetica Neue,sans-serif;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale}code{font-family:source-code-pro,Menlo,Monaco,Consolas,Courier New,monospace}.tableWrapper{-overflow:auto}.tableWrapper .model-params{background-color:#209cee;color:rgba(0,0,0,.7)}.tableWrapper .table-meta{background-color:#fff3c5;color:rgba(0,0,0,.7)}.tableWrapper .model-best{background-color:#ff3860;color:rgba(0,0,0,.7)}.tableWrapper .model-competing{background-color:#ffdd57;color:rgba(0,0,0,.7)}.tableWrapper .model-paper{background-color:#ff3860;color:#fff}.tableWrapper .dataset-sub{background-color:#23d160;color:#fff}.tableWrapper .dataset-metric{background-color:#209cee;color:#fff}.tableWrapper .dataset{background-color:#02bd43;color:#fff}.tableWrapper .trash{background-color:#363636;color:#f5f5f5}.tableWrapper .wtf{background-color:#f0f;color:#f5f5f5}.tableWrapper .dataset-task{background-color:#77ecdd;color:rgba(0,0,0,.7)}.tableWrapper .dataset-paper{background-color:#e4ffee;color:rgba(0,0,0,.7)}.tableWrapper td.focused-cell{outline:2px solid #9ecaed;border-radius:7px;box-shadow:0 0 10px #9ecaed}div.form-group>input.form-control.input-sm{border-radius:2px;font-size:.75rem;background-color:#fff;color:#363636;box-shadow:inset 0 1px 2px rgba(10,10,10,.1);max-width:100%;width:100%;height:2.25em;padding:calc(.375em - 1px) calc(.625em - 1px);position:relative;border:1px solid #b5b5b5}div.form-group>input.form-control.input-sm:focus{border-color:#3273dc;box-shadow:0 0 0 .125em rgba(50,115,220,.25)}
+</style>
+"""

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	+table_style="""<style>body{margin:0;padding:0;font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Roboto,Oxygen,Ubuntu,Cantarell,Fira Sans,Droid Sans,Helvetica Neue,sans-serif;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale}code{font-family:source-code-pro,Menlo,Monaco,Consolas,Courier New,monospace}.tableWrapper{-overflow:auto}.tableWrapper .model-params{background-color:#209cee;color:rgba(0,0,0,.7)}.tableWrapper .table-meta{background-color:#fff3c5;color:rgba(0,0,0,.7)}.tableWrapper .model-best{background-color:#ff3860;color:rgba(0,0,0,.7)}.tableWrapper .model-competing{background-color:#ffdd57;color:rgba(0,0,0,.7)}.tableWrapper .model-paper{background-color:#ff3860;color:#fff}.tableWrapper .dataset-sub{background-color:#23d160;color:#fff}.tableWrapper .dataset-metric{background-color:#209cee;color:#fff}.tableWrapper .dataset{background-color:#02bd43;color:#fff}.tableWrapper .trash{background-color:#363636;color:#f5f5f5}.tableWrapper .wtf{background-color:#f0f;color:#f5f5f5}.tableWrapper .dataset-task{background-color:#77ecdd;color:rgba(0,0,0,.7)}.tableWrapper .dataset-paper{background-color:#e4ffee;color:rgba(0,0,0,.7)}.tableWrapper td.focused-cell{outline:2px solid #9ecaed;border-radius:7px;box-shadow:0 0 10px #9ecaed}div.form-group>input.form-control.input-sm{border-radius:2px;font-size:.75rem;background-color:#fff;color:#363636;box-shadow:inset 0 1px 2px rgba(10,10,10,.1);max-width:100%;width:100%;height:2.25em;padding:calc(.375em - 1px) calc(.625em - 1px);position:relative;border:1px solid #b5b5b5}div.form-group>input.form-control.input-sm:focus{border-color:#3273dc;box-shadow:0 0 0 .125em rgba(50,115,220,.25)}
	`2`	`+</style>`
	`3`	`+"""`