Skip to content

Commit c2eac8a

Browse files
committed
Use latexml to convert tex to html
* use the timeout command to limit chromium execution (the --timeout parameter turned out to be ineffective)
* avoid splitting tables into head, body and foot in pandas.read_html (otherwise headers come out ill-shaped when some <th> cells use rowspan > 1)
* adapt table extraction to latexml output
* extract table layouts (borders, cell alignment, headers (autodetected by latexml), span tags)
* adapt the reference normalization script to latexml output
* try to parse ms.tex, main.tex and 00_main.tex before falling back to the documentclass heuristic
* add table layout support to the paper collection api
* match annotations of htlatex-parsed tables
1 parent 2ad717e commit c2eac8a

File tree

7 files changed

+327
-86
lines changed

7 files changed

+327
-86
lines changed

clean_html.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,4 @@ SOURCE=$(realpath "$1")
44
# Make sure the output directory exists before resolving the output path.
# The command substitution is quoted so that paths containing whitespace
# are passed to mkdir as a single argument.
mkdir -p "$(dirname "$2")"
OUTPUT=$(realpath "$2")

# Render the page in headless chromium and dump the resulting DOM.
# chromium's own --timeout flag proved ineffective, so the whole browser
# process is additionally wrapped in `timeout`, which kills it after 20s.
# NOTE(review): `timeout -t SECS` is the older BusyBox spelling; presumably
# that is what the pinned zenika/alpine-chrome:73 image ships - confirm
# before bumping the image tag.
docker run --rm -v "$SOURCE":/files/index.html:ro --entrypoint '' zenika/alpine-chrome:73 timeout -t 20 -s KILL chromium-browser --headless --disable-gpu --disable-software-rasterizer --no-sandbox --timeout=30000 --dump-dom /files/index.html > "$OUTPUT"

extract_tables.py

Lines changed: 215 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -9,15 +9,43 @@
99
import json
1010
import re
1111
from ast import literal_eval
12+
from collections import OrderedDict
13+
from dataclasses import dataclass
14+
from typing import Set
1215

1316
from tabular import Tabular
1417

1518

19+
# begin of dirty hack
20+
# pandas parsing of html tables is really nice
21+
# but it has a lot of defaults that can't be
22+
# modified
23+
24+
# one of the defaults is forcing thead rows
25+
# into column names, ignoring value of `header`
26+
# param
27+
28+
# the second issue is parsing numerical-looking
29+
# values into floats
30+
31+
# Keep a handle on pandas' own implementation so the wrapper can delegate to it.
_old_data_to_frame = pd.io.html._data_to_frame


def _new_data_to_frame(**kwargs):
    """Hand all table rows to pandas as plain body rows.

    pandas passes the parsed rows as a ``(head, body, foot)`` triple under the
    ``data`` keyword.  The three parts are concatenated, in that order, and
    forwarded as body-only data so that header rows are not turned into
    column names.  All remaining keyword arguments are passed through
    untouched.

    Returns whatever the original ``_data_to_frame`` returns.
    """
    head, body, foot = kwargs.pop("data")
    # Build a fresh list rather than extending ``body`` in place, so the
    # caller's row list is never mutated.
    rows = list(head or []) + list(body or []) + list(foot or [])
    return _old_data_to_frame(data=(None, rows, None), **kwargs)


pd.io.html._data_to_frame = _new_data_to_frame
40+
# end of dirty hack
41+
42+
43+
1644
def flatten_tables(soup):
    """Turn tabulars nested inside another tabular into plain ``div`` trees.

    Every ``.ltx_tabular`` element that has a ``.ltx_tabular`` ancestor is
    renamed to ``div``, together with its row, cell, section and column
    elements.  The soup is modified in place; nothing is returned.
    """
    structural_tags = "tr, td, th, colgroup, tbody, thead, tfoot, col"
    for nested in soup.select(".ltx_tabular .ltx_tabular"):
        nested.name = 'div'
        for part in nested.select(structural_tags):
            part.name = 'div'
2250

2351

@@ -28,50 +56,146 @@ def escape(s):
2856
def unescape(r):
    """Evaluate the Python literal written in *r* and return its value."""
    value = literal_eval(r)
    return value
3058

31-
32-
multirow_re = re.compile(r"^\s*rows=(P<rows>\d+)\s*$")
3359
# Runs of two or more whitespace characters are tried FIRST so that a line
# break together with the indentation around it collapses into one space.
# With the alternatives the other way round, "a\n  b" became "a  b": the
# newline was replaced on its own and the following spaces a second time.
# A lone space or tab is left untouched; a lone line break becomes a space.
whitespace_re = re.compile(r'\s{2,}|[\r\n]+')


def clear_ws(s):
    """Strip *s* and squeeze its inner whitespace.

    Leading and trailing whitespace is removed; every line break and every
    run of two or more whitespace characters is replaced by a single space.
    """
    return whitespace_re.sub(" ", s.strip())
63+
3564
def escape_table_content(soup):
    """Replace the content of every ``td``/``th`` by its escaped plain text.

    For each cell the text is extracted, whitespace-normalised with
    ``clear_ws`` and passed through ``escape``; the result becomes the
    cell's only content.  The soup is modified in place.
    """
    cells = soup.find_all(["td", "th"])
    for cell in cells:
        plain = clear_ws(cell.get_text())
        cell.string = escape(plain)
4668

47-
48-
def fix_htlatex_multirow(df):
49-
rows, cols = df.shape
50-
51-
for col in range(cols):
52-
for row in range(rows):
53-
cell = df.iloc[row, col]
54-
if cell.startswith("multirow="):
55-
pos = cell.find(';')
56-
multirows = int(cell[9:pos])
57-
assert df.iloc[row+1: row+multirows, col].isna().all()
58-
df.iloc[row: row+multirows, col] = cell[pos+1:]
59-
60-
6169
def unescape_table_content(df):
    """Return a frame in which ``unescape`` has been applied to every cell of *df*."""
    decoded = df.applymap(unescape)
    return decoded
6371

6472

73+
@dataclass
class LayoutCell:
    """Layout description of a single table cell.

    ``str(cell)`` renders the layout as space-separated tokens:
    ``border-*`` for each border, ``align-*`` for each alignment, ``span-*``
    for each span marker and a final ``header`` token for header cells.
    ``colspan`` and ``rowspan`` are not part of the rendered string.
    """
    borders: Set[str]  # border names, rendered as "border-<name>"
    align: Set[str]    # alignment names, rendered as "align-<name>"
    header: bool       # True adds the "header" token
    colspan: int
    rowspan: int
    span: Set[str]     # span markers, rendered as "span-<name>"

    def __str__(self):
        # Sets have no stable iteration order for strings across interpreter
        # runs; sorting keeps the serialized layout reproducible.
        tokens = [f"border-{name}" for name in sorted(self.borders)]
        tokens += [f"align-{name}" for name in sorted(self.align)]
        tokens += [f"span-{name}" for name in sorted(self.span)]
        if self.header:
            tokens.append("header")
        return ' '.join(tokens)
88+
89+
def to_layout(s):
    """Parse a ``borders,align,header,colspan,rowspan`` string into a LayoutCell.

    ``borders`` and ``align`` are whitespace-separated name lists, ``header``
    is true only for the literal ``"True"``, and ``colspan``/``rowspan`` are
    integers.  An empty string yields an empty, non-header 1x1 cell.  The
    ``span`` set of the result always starts out empty.
    """
    if s == "":
        return LayoutCell(set(), set(), False, 1, 1, set())
    border_part, align_part, header_part, colspan_part, rowspan_part = s.split(",")
    return LayoutCell(
        set(border_part.split()),
        set(align_part.split()),
        header_part == "True",
        int(colspan_part),
        int(rowspan_part),
        set(),
    )
99+
100+
101+
def _mark_spans(cells, extent_attr, prefix, leading_borders, trailing_borders):
    """Tag the cells of one row or column that belong to a multi-cell span.

    *cells* is scanned in order.  A counter is loaded with a cell's extent
    (its ``colspan`` or ``rowspan``, chosen by *extent_attr*) whenever it
    reaches zero, and decremented once per cell.  Each cell whose extent
    exceeds 1 gets ``<prefix>b`` (begin), ``<prefix>i`` (inner) or
    ``<prefix>e`` (end) added to its ``span`` set, and loses the borders
    that would lie inside the span: the trailing ones at the beginning, the
    leading ones at the end, both in between.
    """
    remaining = 1
    for cell in cells:
        remaining -= 1
        extent = getattr(cell, extent_attr)
        if remaining == 0:
            remaining = extent
        if extent <= 1:
            continue
        if remaining == 1:
            cell.span.add(prefix + "e")
            cell.borders -= leading_borders
        elif remaining == extent:
            cell.span.add(prefix + "b")
            cell.borders -= trailing_borders
        else:
            cell.span.add(prefix + "i")
            cell.borders -= leading_borders | trailing_borders


def fix_layout(layout):
    """Add span markers to, and drop inner borders from, the cells of *layout*.

    *layout* is a DataFrame whose cells expose ``colspan``, ``rowspan``,
    ``span`` and ``borders``.  Rows are scanned for column spans (markers
    ``cb``/``ci``/``ce``, left/right borders) and columns for row spans
    (markers ``rb``/``ri``/``re``, top/bottom borders).  Cells are modified
    in place; nothing is returned.
    """
    for _, row in layout.iterrows():
        _mark_spans(row, "colspan", "c", {"l", "ll"}, {"r", "rr"})
    for col in layout:
        _mark_spans(layout[col], "rowspan", "r", {"t", "tt"}, {"b", "bb"})
135+
136+
137+
def decouple_layout(df):
    """Split a frame of ``layout;text`` cells into a text frame and a layout frame.

    Each non-empty cell is split at its first ``;``: the part before it is
    parsed with ``to_layout``, the part after it is the cell text.  Empty
    cells give an empty text and an empty layout.  The layout frame is
    post-processed with ``fix_layout`` before being returned.

    Returns a ``(tab, layout)`` pair of DataFrames.
    """
    def split_cell(value):
        if value == "":
            return ("", "")
        return value.split(";", 1)

    pairs = df.applymap(split_cell)
    tab = pairs.applymap(lambda pair: pair[1])
    layout = pairs.applymap(lambda pair: to_layout(pair[0]))
    fix_layout(layout)
    return tab, layout
143+
144+
65145
def fix_table(df):
    """Clean up a parsed table and split it into content and layout.

    Missing cells are first filled with the escaped empty string (``"''"``).
    Rows and columns consisting solely of such cells are then dropped, the
    remaining cells are unescaped, and the frame is handed to
    ``decouple_layout``.

    Returns the ``(tab, layout)`` pair produced by ``decouple_layout``.
    """
    df = df.fillna(repr(''))
    # ``np.nan`` instead of the ``np.NaN`` alias: the alias was removed in
    # NumPy 2.0, the lowercase name exists in every version.
    df = df.replace("''", np.nan).dropna(how='all').dropna(axis='columns', how='all').fillna("''")
    df = unescape_table_content(df)
    return decouple_layout(df)
150+
151+
152+
def fix_id(s):
    """Return *s* with every ``.`` replaced by ``-``."""
    return "-".join(s.split("."))
154+
155+
156+
def wrap_elem_content(elem, begin, end):
    """Surround the content of *elem* with the text nodes *begin* and *end*.

    *begin* is inserted as the first child and *end* appended as the last
    child of *elem*, which is modified in place.
    """
    opening = NavigableString(begin)
    closing = NavigableString(end)
    elem.insert(0, opening)
    elem.append(closing)
70159

71160

72161
def move_out_references(table):
    """Mark in-document links inside *table* with textual ``<ref>`` tags.

    The content of every anchor whose ``href`` starts with ``#`` is wrapped
    in ``<ref id='...'>`` / ``</ref>`` text nodes; the id is the href
    without its leading ``#``, passed through ``fix_id``.
    """
    for link in table.select('a[href^="#"]'):
        target = fix_id(link["href"][1:])
        wrap_elem_content(link, f"<ref id='{target}'>", "</ref>")
164+
165+
166+
#def move_out_text_styles(table):
167+
# ltx_font = 'ltx_font_'
168+
# font_selector = f'[class*="{ltx_font}"]'
169+
#
170+
# for elem in table.select(f"span{font_selector}, a{font_selector}, em{font_selector}"):
171+
# for c in set(elem.attrs["class"]):
172+
# if c == ltx_font + 'bold':
173+
# wrap_elem_content(elem, "<b>", "</b>")
174+
# elif c == ltx_font + 'italic':
175+
# wrap_elem_content(elem, "<i>", "</i>")
176+
177+
178+
def move_out_styles(table):
    """Prefix the content of each cell in *table* with its layout description.

    For every ``td``/``th`` the CSS classes are inspected: ``ltx_border_*``
    and ``ltx_align_*`` classes contribute their suffix to the border and
    alignment lists, and ``ltx_th`` marks the cell as a header.  A text node
    of the form ``borders,align,header,colspan,rowspan;`` is then inserted
    in front of the cell content, where ``borders`` and ``align`` are
    space-separated, ``header`` is ``True`` or ``False`` and the spans
    default to ``1``.  The table is modified in place.
    """
    ltx_border = 'ltx_border_'
    ltx_align = 'ltx_align_'
    ltx_th = 'ltx_th'

    for elem in table.select('td, th'):
        borders = []
        align = []
        header = False
        # A cell without any class attribute has no styles to report; fall
        # back to an empty list instead of raising KeyError.
        for c in elem.attrs.get("class", []):
            if c.startswith(ltx_border):
                borders.append(c[len(ltx_border):])
            elif c.startswith(ltx_align):
                align.append(c[len(ltx_align):])
            elif c == ltx_th:
                header = True
        b = ' '.join(borders)
        a = ' '.join(align)
        colspan = elem.attrs.get("colspan", "1")
        rowspan = elem.attrs.get("rowspan", "1")
        wrap_elem_content(elem, f"{b},{a},{header},{colspan},{rowspan};", "")
75199

76200

77201
def html2data(table):
@@ -90,60 +214,88 @@ def save_tables(data, outdir):
90214

91215
for num, table in enumerate(data, 1):
92216
filename = f"table_{num:02}.csv"
217+
layout = f"layout_{num:02}.csv"
93218
save_table(table.data, outdir / filename)
94-
metadata.append(dict(filename=filename, caption=table.caption, figure_id=table.figure_id))
219+
save_table(table.layout, outdir / layout)
220+
metadata.append(dict(filename=filename, layout=layout, caption=table.caption, figure_id=table.figure_id))
95221
with open(outdir / "metadata.json", "w") as f:
96222
json.dump(metadata, f)
97223

98224

99-
def deepclone(elem):
100-
return BeautifulSoup(str(elem), "lxml")
101-
102-
103225
def set_ids_by_labels(soup):
    """Copy the id of each captioned figure onto the tabulars it contains.

    For every ``.ltx_caption`` whose parent is a ``figure`` element with an
    ``id`` attribute, that id is stored as ``data-figure-id`` on every
    ``.ltx_tabular`` element inside the figure.
    """
    for caption in soup.select(".ltx_caption"):
        fig = caption.parent
        if fig.name != "figure" or not fig.has_attr("id"):
            continue
        label = fig.attrs["id"]
        for tabular in fig.select(".ltx_tabular"):
            tabular["data-figure-id"] = label
112233

234+
def is_figure(tag):
    """Tell whether *tag* is a ``figure`` element."""
    # Only the tag name is checked; matching on the ltx_figure / ltx_float
    # classes instead was considered and left disabled.
    return "figure" == tag.name
238+
239+
def fix_span_tables(soup):
    """Rename ``span`` elements that carry table classes to real table tags.

    A ``span`` with one of the ``ltx_*`` table classes is renamed to the
    corresponding HTML element.  When a span carries several of these
    classes, the first match in the mapping order below wins.  The soup is
    modified in place.
    """
    tag_by_class = OrderedDict((
        ("ltx_tabular", "table"),
        ("ltx_tr", "tr"),
        ("ltx_th", "th"),
        ("ltx_tbody", "tbody"),
        ("ltx_thead", "thead"),
        ("ltx_td", "td"),
        ("ltx_tfoot", "tfoot"),
    ))

    selector = ','.join("span." + cls for cls in tag_by_class)
    for node in soup.select(selector):
        node_classes = node.attrs["class"]
        replacement = next(
            (tag for cls, tag in tag_by_class.items() if cls in node_classes),
            None,
        )
        if replacement is not None:
            node.name = replacement
250+
251+
# pandas.read_html treats th differently
252+
# by trying in a few places to get column names
253+
# for now <th>s are changed to <td>s, but we still
254+
# have classes (ltx_th) to distinguish them
255+
def fix_th(soup):
    """Rename every ``th`` element in *soup* to ``td``, in place."""
    header_cells = soup.find_all("th")
    for cell in header_cells:
        cell.name = "td"
258+
259+
def remove_footnotes(soup):
    """Detach every ``.ltx_role_footnote`` element from *soup*."""
    footnotes = soup.select(".ltx_role_footnote")
    for note in footnotes:
        note.extract()
262+
113263

114264
def extract_tables(filename, outdir):
    """Extract all tables of an HTML file and save them below *outdir*.

    The file is parsed as UTF-8 with the lxml parser.  Figure ids are
    propagated to their tabulars, span-based tables and header cells are
    normalised, and nested tabulars are flattened.  Each remaining
    ``table.ltx_tabular`` outside of ``.ltx_authors`` is converted to a
    content frame and a layout frame; tables for which ``html2data``
    returns ``None`` are skipped.  The caption is the whitespace-normalised
    text of the ``figcaption`` of the enclosing figure, or ``None`` when
    there is none.  The collected tables are written with ``save_tables``;
    *outdir* is created if missing.
    """
    with open(filename, "rb") as f:
        html = f.read()
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    soup = BeautifulSoup(html, "lxml", from_encoding="utf-8")

    # Document-wide normalisation; the order matters because later steps
    # select on the tag names produced by earlier ones.
    for normalise in (set_ids_by_labels, fix_span_tables, fix_th, flatten_tables):
        normalise(soup)

    data = []
    for table in soup.find_all("table", class_="ltx_tabular"):
        if table.find_parent(class_="ltx_authors") is not None:
            continue

        figure = table.find_parent(is_figure)
        for prepare in (remove_footnotes, move_out_references,
                        move_out_styles, escape_table_content):
            prepare(table)

        frame = html2data(table)
        if frame is None:
            continue
        frame, layout = fix_table(frame)

        cap_el = figure.find("figcaption") if figure is not None else None
        caption = clear_ws(cap_el.get_text()) if cap_el is not None else None
        figure_id = table.get("data-figure-id")
        data.append(Tabular(frame, layout, caption, figure_id))

    save_tables(data, outdir)
149301

latex2html.sh

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,23 @@
11
#!/usr/bin/env bash
# Convert the LaTeX sources in /files/ro-source to HTML with engrafo and
# copy the resulting index.html to /files/htmls/<OUTNAME>.
#
# Usage: latex2html.sh OUTNAME
OUTNAME="$1"
echo "$OUTNAME"
SOURCE_DIR="/files/ro-source"
OUTPUT_DIR="/files/htmls"

# Without the source directory nothing below can work; stop right away.
cd "$SOURCE_DIR" || exit 1

# Prefer the conventional main-file names, in this order.
MAINTEX=""
for candidate in ms.tex main.tex 00_main.tex
do
  if [ -f "$SOURCE_DIR/$candidate" ]
  then
    MAINTEX="$SOURCE_DIR/$candidate"
    break
  fi
done

# Otherwise fall back to the first top-level .tex file mentioning documentclass.
if [ -z "$MAINTEX" ]
then
  MAINTEX=$(find "$SOURCE_DIR" -maxdepth 1 -type f -iname "*.tex" -print0 | xargs -0 grep -l documentclass | head -n 1)
fi

# Report a missing main file explicitly instead of passing an empty path on.
if [ -z "$MAINTEX" ]
then
  echo "no main .tex file found in $SOURCE_DIR" >&2
  exit 1
fi

timeout -s KILL 300 engrafo "$MAINTEX" /files/output

cp /files/output/index.html "$OUTPUT_DIR/$OUTNAME"

0 commit comments

Comments
 (0)