Skip to content

Commit b40eefe

Browse files
author
Marcin Kardas
committed
Extraction pipeline fixes
* force BeautifulSoup to use utf-8 for table extraction
* use inline table style
* use find to allow for nested directories with papers
1 parent 5cb2a40 commit b40eefe

File tree

7 files changed

+22
-19
lines changed

7 files changed

+22
-19
lines changed

Makefile

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
DATA_DIR = ../data
1+
DATA_DIR = data
22
ANNOTATIONS_DIR = $(DATA_DIR)/annotations
33
ARXIV_DIR = $(DATA_DIR)/arxiv
44
ARCHIVES_DIR = $(ARXIV_DIR)/sources
@@ -8,26 +8,25 @@ FIXED_HTMLS_DIR = $(ARXIV_DIR)/htmls-clean
88
TABLES_DIR = $(ARXIV_DIR)/tables
99
TEXTS_DIR = $(ARXIV_DIR)/texts
1010

11-
ARCHIVES = $(wildcard $(ARCHIVES_DIR)/**.gz)
12-
UNPACKS = $(patsubst $(ARCHIVES_DIR)/%.gz,$(UNPACKED_DIR)/%,$(ARCHIVES))
13-
HTMLS = $(patsubst $(ARCHIVES_DIR)/%.gz,$(HTMLS_DIR)/%.html,$(ARCHIVES))
14-
FIXED_HTMLS = $(patsubst $(ARCHIVES_DIR)/%.gz,$(FIXED_HTMLS_DIR)/%.html,$(ARCHIVES))
15-
TABLES = $(patsubst $(ARCHIVES_DIR)/%.gz,$(TABLES_DIR)/%,$(ARCHIVES))
16-
TEXTS = $(patsubst $(ARCHIVES_DIR)/%.gz,$(TEXTS_DIR)/%.json,$(ARCHIVES))
17-
18-
$(shell mkdir -p "$(DATA_DIR)" "$(ANNOTATIONS_DIR)" "$(UNPACKED_DIR)" "$(HTMLS_DIR)" "$(FIXED_HTMLS_DIR)" "$(TABLES_DIR)" "$(TEXTS_DIR)")
11+
ARCHIVES := $(shell find $(ARCHIVES_DIR) -name '*.gz' -type f 2>/dev/null)
12+
UNPACKS := $(patsubst $(ARCHIVES_DIR)/%.gz,$(UNPACKED_DIR)/%,$(ARCHIVES))
13+
HTMLS := $(patsubst $(ARCHIVES_DIR)/%.gz,$(HTMLS_DIR)/%.html,$(ARCHIVES))
14+
FIXED_HTMLS := $(patsubst $(ARCHIVES_DIR)/%.gz,$(FIXED_HTMLS_DIR)/%.html,$(ARCHIVES))
15+
TABLES := $(patsubst $(ARCHIVES_DIR)/%.gz,$(TABLES_DIR)/%,$(ARCHIVES))
16+
TEXTS := $(patsubst $(ARCHIVES_DIR)/%.gz,$(TEXTS_DIR)/%.json,$(ARCHIVES))
1917

2018
.PHONY: all
2119
all: $(ANNOTATIONS_DIR)/pdfs-urls.csv $(ANNOTATIONS_DIR)/sources-urls.csv extract_all
2220

2321
.PHONY: test
2422
test: DATA_DIR = test/data
23+
test: TABLE_FILE = $(TABLES_DIR)/paper/table_01.csv
2524
test:
2625
mkdir -p $(ARCHIVES_DIR)
2726
tar czf $(ARCHIVES_DIR)/paper.gz -C test/src .
28-
$(MAKE) DATA_DIR=$(DATA_DIR) extract_all
29-
cat $(TABLES_DIR)/paper/table_01.csv
30-
diff $(TABLES_DIR)/paper/table_01.csv test/src/table_01.csv
27+
$(MAKE) DATA_DIR=$(DATA_DIR) --always-make extract_all
28+
cat $(TABLE_FILE)
29+
diff $(TABLE_FILE) test/src/table_01.csv
3130

3231
.PHONY: extract_all extract_texts extract_tables fix_htmls_all convert_all unpack_all
3332

@@ -72,6 +71,7 @@ $(ANNOTATIONS_DIR)/%: $(ANNOTATIONS_DIR)/%.gz
7271
gunzip -kf $^
7372

7473
$(ANNOTATIONS_DIR)/evaluation-tables.json.gz:
74+
# Create the download directory as an ordinary recipe step rather than via
# $(shell ...): make runs $(shell ...) while expanding the recipe, so it would
# also execute under `make -n`, is never echoed, and its failure is ignored.
mkdir -p "$(ANNOTATIONS_DIR)"
7575
wget https://paperswithcode.com/media/about/evaluation-tables.json.gz -O $@
7676

7777

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ conda env create -f environment.yml
2626
source activate xtables
2727
make -j 8 -i extract_all > stdout.log 2> stderr.log
2828
```
29-
where `8` is number of jobs to run simultaneously.
29+
where `8` is the number of jobs to run simultaneously. Optionally, one can specify the path to the data directory, e.g., `make DATA_DIR=mydata ...`.
3030

3131
## Test
3232
To test the whole extraction on a single file run

extract_tables.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ def extract_tables(filename, outdir):
116116
html = f.read()
117117
outdir = Path(outdir)
118118
outdir.mkdir(parents=True, exist_ok=True)
119-
soup = BeautifulSoup(html, "lxml")
119+
soup = BeautifulSoup(html, "lxml", from_encoding="utf-8")
120120
flatten_tables(soup)
121121
set_ids_by_labels(soup)
122122
tables = soup.select("div.tabular")

sota_extractor2/data/elastic.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import pandas as pd
22
import re
3-
from bs4 import BeautifulSoup
43

54
from elasticsearch_dsl import Document, Boolean, Object, \
65
analyzer, InnerDoc, Keyword, Text, Integer, tokenizer, token_filter
@@ -141,7 +140,7 @@ class Index:
141140

142141
def to_json(self):
143142
data = self.to_dict()
144-
return serializer.dumps(d)
143+
return serializer.dumps(data)
145144

146145
@classmethod
147146
def from_json(cls, json, paper_id=None):

sota_extractor2/data/paper_collection.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@ def __init__(self, path, load_texts=True, load_tables=True):
3939
outer_join = set(texts).union(set(tables))
4040

4141
self._papers = {k: Paper(texts.get(k), tables.get(k), annotations.get(k)) for k in outer_join}
42-
self.annotations = annotations
4342

4443
def __len__(self):
4544
return len(self._papers)

sota_extractor2/helpers/jupyter.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from IPython.core.display import display, HTML
2-
2+
from .table_style import table_style
33
def set_seed(seed, name):
44
import torch
55
import numpy as np
@@ -11,6 +11,8 @@ def set_seed(seed, name):
1111

1212
def display_html(s): return display(HTML(s))
1313

14+
15+
1416
def display_table(table, structure=None):
1517
"""
1618
matrix - 2d ndarray with cell values
@@ -22,7 +24,7 @@ def display_table(table, structure=None):
2224
matrix = table
2325
if structure is None: structure = table.matrix_gold_tags
2426
html = []
25-
html.append('<link href="http://10.0.1.145:8001/static/css/main.bd3d2d63.chunk.css" rel="stylesheet">')
27+
html.append(table_style)
2628
html.append('<div class="tableWrapper">')
2729
html.append("<table>")
2830
for row,struc_row in zip(matrix, structure):
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
table_style="""<style>body{margin:0;padding:0;font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Roboto,Oxygen,Ubuntu,Cantarell,Fira Sans,Droid Sans,Helvetica Neue,sans-serif;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale}code{font-family:source-code-pro,Menlo,Monaco,Consolas,Courier New,monospace}.tableWrapper{-overflow:auto}.tableWrapper .model-params{background-color:#209cee;color:rgba(0,0,0,.7)}.tableWrapper .table-meta{background-color:#fff3c5;color:rgba(0,0,0,.7)}.tableWrapper .model-best{background-color:#ff3860;color:rgba(0,0,0,.7)}.tableWrapper .model-competing{background-color:#ffdd57;color:rgba(0,0,0,.7)}.tableWrapper .model-paper{background-color:#ff3860;color:#fff}.tableWrapper .dataset-sub{background-color:#23d160;color:#fff}.tableWrapper .dataset-metric{background-color:#209cee;color:#fff}.tableWrapper .dataset{background-color:#02bd43;color:#fff}.tableWrapper .trash{background-color:#363636;color:#f5f5f5}.tableWrapper .wtf{background-color:#f0f;color:#f5f5f5}.tableWrapper .dataset-task{background-color:#77ecdd;color:rgba(0,0,0,.7)}.tableWrapper .dataset-paper{background-color:#e4ffee;color:rgba(0,0,0,.7)}.tableWrapper td.focused-cell{outline:2px solid #9ecaed;border-radius:7px;box-shadow:0 0 10px #9ecaed}div.form-group>input.form-control.input-sm{border-radius:2px;font-size:.75rem;background-color:#fff;color:#363636;box-shadow:inset 0 1px 2px rgba(10,10,10,.1);max-width:100%;width:100%;height:2.25em;padding:calc(.375em - 1px) calc(.625em - 1px);position:relative;border:1px solid #b5b5b5}div.form-group>input.form-control.input-sm:focus{border-color:#3273dc;box-shadow:0 0 0 .125em rgba(50,115,220,.25)}
2+
</style>
3+
"""

0 commit comments

Comments
 (0)