Skip to content

Commit 3df4648

Browse files
committed
Make pipeline more robust
* add a temporary paper type that automatically deletes a paper from Elasticsearch
* add an unpack step
* add custom exceptions
1 parent 1e33103 commit 3df4648

File tree

12 files changed

+131
-32
lines changed

12 files changed

+131
-32
lines changed

environment.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,4 @@ dependencies:
1616
- elasticsearch-dsl=7.0.0
1717
- ipython=7.5.0
1818
- joblib=0.13.2
19+
- python-magic=0.4.15

extract_tables.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -335,9 +335,7 @@ def remove_footnotes(soup):
335335
elem.extract()
336336

337337

338-
def extract_tables(filename):
339-
with open(filename, "rb") as f:
340-
html = f.read()
338+
def extract_tables(html):
341339
soup = BeautifulSoup(html, "lxml", from_encoding="utf-8")
342340
set_ids_by_labels(soup)
343341
fix_span_tables(soup)
@@ -377,7 +375,9 @@ def extract_tables(filename):
377375
return data
378376

379377
def extract_tables_cmd(filename, outdir):
    """Command-line entry point: extract tables from an HTML file.

    Reads *filename* as raw bytes, runs the table extraction, and saves
    the resulting tables under *outdir* (created if missing).
    """
    html = Path(filename).read_bytes()
    tables = extract_tables(html)
    out_path = Path(outdir)
    out_path.mkdir(parents=True, exist_ok=True)
    save_tables(tables, out_path)

sota_extractor2/data/elastic.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from bs4 import BeautifulSoup
12
import pandas as pd
23
import re
34

@@ -260,6 +261,11 @@ def print_section(self, name, clean_up=lambda x: x):
260261
def read_html(cls, file):
261262
return read_html(file)
262263

264+
@classmethod
def from_html(cls, html, paper_id):
    """Build a paper document directly from an in-memory HTML string.

    Counterpart to the file-based constructors: parses *html* with
    BeautifulSoup and delegates to ``parse_html``.
    """
    parsed = BeautifulSoup(html, "html.parser")
    return cls.parse_html(parsed, paper_id)
263269
@classmethod
264270
def parse_paper(cls, file, paper_id=None):
265271
file = Path(file)

sota_extractor2/data/structure.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,4 +165,7 @@ def __init__(self):
165165
setup_default_connection()
166166

167167
def __call__(self, paper, tables, paper_limit=30, corpus_limit=10):
    """Collect evidence rows for every table of *paper*.

    Returns one concatenated DataFrame, or an empty DataFrame when
    *tables* is empty — ``pd.concat`` raises on an empty list, so the
    guard keeps callers from having to special-case papers without
    tables.
    """
    dfs = [evidence_for_table(paper, table, paper_limit, corpus_limit)
           for table in tables]
    # Idiomatic emptiness check (truthiness) instead of len(dfs).
    if not dfs:
        return pd.DataFrame()
    return pd.concat(dfs)

sota_extractor2/errors.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
class PipelineError(Exception):
    """Base class for all errors raised by the extraction pipeline."""


class UnpackError(PipelineError):
    """Raised when a downloaded source file cannot be unpacked."""


class LatexConversionError(PipelineError):
    """Raised when LaTeX-to-HTML conversion fails."""

sota_extractor2/helpers/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
from .temp_paper import TempPaper
2+
from .latex_converter import LatexConverter
3+
from .unpack import Unpack
4+
5+
__all__ = ["TempPaper", "LatexConverter", "Unpack"]
Lines changed: 35 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,27 @@
11
import docker
2+
from docker.errors import ContainerError, ImageNotFound
23
from pathlib import Path
4+
from tempfile import TemporaryDirectory
5+
6+
from sota_extractor2.errors import LatexConversionError
7+
38

49
def ro_bind(path):
    """Docker volume spec mounting *path* read-only."""
    return {"bind": path, "mode": "ro"}
5-
def rw_bind(path): return dict(bind=path, mode='rw')
610

711

12+
def rw_bind(path):
    """Docker volume spec mounting *path* read-write."""
    return {"bind": path, "mode": "rw"}
13+
814

915
class LatexConverter:
1016
def __init__(self, base_path):
    """Remember the script directory and connect to the local Docker daemon.

    NOTE(review): the original comment mentions pulling the
    arxivvanity/engrafo image, but nothing here pulls it — presumably it
    must already be present locally; confirm.
    """
    self.base_path = Path(base_path)
    self.client = docker.from_env()
1420

15-
def to_html(self, source_dir, output_dir):
21+
def latex2html(self, source_dir, output_dir):
1622
base = self.base_path
23+
source_dir = Path(source_dir)
24+
output_dir = Path(output_dir)
1725
volumes = {
1826
base / "latex2html.sh": ro_bind("/files/latex2html.sh"),
1927
base / "guess_main.py": ro_bind("/files/guess_main.py"), # todo: run guess_main outside of docker
@@ -25,6 +33,28 @@ def to_html(self, source_dir, output_dir):
2533
output_dir.mkdir(parents=True, exist_ok=True)
2634
filename = "index.html"
2735
command = ["/files/latex2html.sh", filename]
28-
self.client.containers.run("arxivvanity/engrafo", command, remove=True,
29-
volumes=volumes) # todo: check if command as a list protects from shell injection
30-
# todo: check for errors
36+
self.client.containers.run("arxivvanity/engrafo", command, remove=True, volumes=volumes)
37+
38+
# todo: check for errors
39+
40+
def clean_html(self, path):
    """Render the HTML file at *path* in headless Chromium and return the DOM.

    The file is mounted read-only into a chrome container and the
    post-render DOM is dumped to stdout, which we capture and decode.
    The outer ``timeout`` kills the browser after 20s as a safety net.
    """
    html_file = Path(path)
    mounts = {
        html_file.resolve(): ro_bind("/files/index.html"),
    }
    command = ("timeout -t 20 -s KILL chromium-browser --headless"
               " --disable-gpu --disable-software-rasterizer --no-sandbox"
               " --timeout=30000 --dump-dom /files/index.html")
    dom = self.client.containers.run("zenika/alpine-chrome:73", command,
                                     remove=True, entrypoint="",
                                     volumes=mounts)
    return dom.decode('utf-8')
52+
53+
def to_html(self, source_dir):
    """Convert a LaTeX source directory to a single cleaned HTML string.

    Runs the engrafo conversion into a throwaway directory, then renders
    the result through headless Chromium.  Docker container failures from
    either step are re-raised as ``LatexConversionError`` so callers can
    handle conversion problems uniformly.
    """
    with TemporaryDirectory() as tmp:
        tmp_path = Path(tmp)
        try:
            self.latex2html(source_dir, tmp_path)
            return self.clean_html(tmp_path / "index.html")
        except ContainerError as err:
            raise LatexConversionError from err

sota_extractor2/helpers/temp_paper.py

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,23 @@
44
import string
55
import random
66

7+
78
# todo: make sure multithreading/processing won't cause collisions
def random_id():
    """Return a throwaway id of the form ``temp_`` + 10 random lowercase letters."""
    letters = (random.choice(string.ascii_lowercase) for _ in range(10))
    return "temp_" + "".join(letters)
1011

1112

12-
def temp_paper(path):
13-
text = PaperText.parse_paper(path, random_id())
14-
tables = extract_tables(path)
15-
return Paper(paper_id=text.meta['id'], text=text, tables=tables, annotations=None)
13+
class TempPaper(Paper):
    """A Paper that exists in Elasticsearch only for the duration of a ``with`` block.

    On entry the paper text is saved (indexed); on exit it is deleted
    again, regardless of whether the block raised.
    """

    def __init__(self, html):
        temp_id = random_id()
        paper_text = PaperText.from_html(html, temp_id)
        paper_tables = extract_tables(html)
        super().__init__(paper_id=temp_id, text=paper_text,
                         tables=paper_tables, annotations=None)

    def __enter__(self):
        # Index the text so evidence queries can see this paper.
        self.text.save()
        return self

    def __exit__(self, exc, value, tb):
        # Always remove the temporary document; exceptions still propagate
        # (nothing is returned, i.e. None).
        self.text.delete()

sota_extractor2/helpers/unpack.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
from magic import Magic
2+
import tarfile
3+
import gzip
4+
from pathlib import Path
5+
from shutil import copyfileobj
6+
from sota_extractor2.errors import UnpackError
7+
8+
9+
class Unpack:
    """Unpack a downloaded paper source into a directory of LaTeX files.

    Sources are either a (possibly compressed) tar archive or a single
    gzipped ``.tex`` file; anything else raises ``UnpackError``.
    """

    def __init__(self):
        # uncompress=True makes libmagic look inside a gzip wrapper, so a
        # gzipped tarball is still reported as application/x-tar and a
        # gzipped .tex file as text/x-tex.
        self.magic = Magic(mime=True, uncompress=True)

    def __call__(self, source, dest):
        """Extract *source* into directory *dest* (created if missing).

        Raises:
            UnpackError: if the detected mime type is not supported.
        """
        source = Path(source)
        dest = Path(dest)
        mime = self.magic.from_file(str(source))
        if mime == 'application/x-tar':
            dest.mkdir(parents=True, exist_ok=True)
            with tarfile.open(source, "r:*") as tar:
                # NOTE(review): extractall on untrusted archives permits
                # path traversal ("tar slip") — consider validating member
                # paths before extraction.
                tar.extractall(dest)
        elif mime == 'text/x-tex':
            dest.mkdir(parents=True, exist_ok=True)
            # Bug fix: the destination must be opened for binary writing.
            # The original used the default read mode, so copyfileobj
            # failed (missing file / unwritable handle) instead of
            # writing main.tex.
            with gzip.open(source, "rb") as src, open(dest / "main.tex", "wb") as dst:
                copyfileobj(src, dst)
        else:
            raise UnpackError(f"Cannot unpack file of type {mime}")

sota_extractor2/models/linking/context_search.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -158,8 +158,8 @@ def compute_context_logprobs(self, context, noise, logprobs):
158158
dss = set(find_datasets(context)) | set(abbrvs.keys())
159159
mss = set(find_metrics(context))
160160
dss -= mss
161-
print("dss", dss)
162-
print("mss", mss)
161+
###print("dss", dss)
162+
###print("mss", mss)
163163
self.compute_logprobs(dss, mss, abbrvs, noise, logprobs)
164164

165165
def match(self, contexts):
@@ -177,9 +177,9 @@ def __call__(self, query, datasets, caption, debug_info=None):
177177
cell = debug_info.cell
178178
key = (datasets, caption, query)
179179
cellstr = f"{cell.table_ext_id}/{cell.row}.{cell.col}"
180-
print(f"[DEBUG] {cellstr}")
181-
print("[DEBUG]", debug_info)
182-
print("query:", query, caption)
180+
###print(f"[DEBUG] {cellstr}")
181+
###print("[DEBUG]", debug_info)
182+
###print("query:", query, caption)
183183
if key in self.queries:
184184
# print(self.queries[key])
185185
for context in key:
@@ -188,10 +188,10 @@ def __call__(self, query, datasets, caption, debug_info=None):
188188
dss = set(find_datasets(context)) | set(abbrvs.keys())
189189
mss = set(find_metrics(context))
190190
dss -= mss
191-
print("dss", dss)
192-
print("mss", mss)
191+
###print("dss", dss)
192+
###print("mss", mss)
193193

194-
print("Taking result from cache")
194+
###print("Taking result from cache")
195195
p = self.queries[key]
196196
else:
197197
dist = self.match(key)
@@ -212,7 +212,7 @@ def __call__(self, query, datasets, caption, debug_info=None):
212212

213213
self.queries[key] = p
214214

215-
print(p)
215+
###print(p)
216216

217217
# error analysis only
218218
if self.debug_gold_df is not None:

0 commit comments

Comments
 (0)