Skip to content

Commit 969717c

Browse files
committed
Escape tables and fix htlatex multirow
1 parent b54a935 commit 969717c

File tree

2 files changed

+66
-14
lines changed

2 files changed

+66
-14
lines changed

extract-all.sh

Lines changed: 0 additions & 6 deletions
This file was deleted.

extract-tables.py

Lines changed: 66 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
11
#!/usr/bin/env python
22

33
import sys
4-
from bs4 import BeautifulSoup
4+
from bs4 import BeautifulSoup, Comment
55
import fire
66
from pathlib import Path
77
import pandas as pd
8+
import numpy as np
89
import json
10+
import re
11+
from ast import literal_eval
912

1013

1114
class Tabular:
@@ -21,15 +24,64 @@ def flatten_tables(soup):
2124
for elem in inner.select("tr, td, colgroup, tbody, col"):
2225
elem.name = 'div'
2326

24-
def html2data(filename, table):
27+
28+
def escape(s):
    """Encode *s* as a Python literal (via repr) so it survives CSV round-trips."""
    literal = repr(s)
    return literal
30+
31+
32+
def unescape(r):
    """Decode a repr()-encoded literal back to the original value (inverse of escape)."""
    value = literal_eval(r)
    return value
34+
35+
36+
# A multirow cell carries an HTML comment of the form "rows=<N>"; capture N.
# NOTE: the previous pattern "(P<rows>\d+)" was missing the "?" of the named
# group, so it matched the literal text "P<rows>" and never the comment.
multirow_re = re.compile(r"^\s*rows=(?P<rows>\d+)\s*$")
# Collapse newlines and runs of two-or-more whitespace characters to one space.
whitespace_re = re.compile(r'[\r\n]+|\s{2,}')
38+
39+
def escape_table_content(soup):
    """Replace every table cell's content with a repr()-escaped one-line string.

    Cells whose htlatex multirow marker (an HTML comment "rows=N" inside a
    div.multirow) is present are prefixed with "multirow=N;" so a later pass
    can expand them down the column.  Mutates *soup* in place.
    """
    for cell in soup.find_all(["td", "th"]):
        text = whitespace_re.sub(" ", cell.get_text().strip())
        payload = escape(text)

        marker = cell.find("div", class_="multirow", recursive=False)
        if marker and marker.contents and isinstance(marker.contents[0], Comment):
            m = multirow_re.match(str(marker.contents[0]))
            if m:
                payload = f"multirow={m.group('rows')};{payload}"

        cell.string = payload
50+
51+
52+
def fix_htlatex_multirow(df):
    """Expand "multirow=N;value" cells down their column, in place.

    escape_table_content tags a cell as "multirow=N;<escaped value>" when
    htlatex marked it as spanning N rows; this copies the value into the
    N-1 placeholder cells below it.  Mutates *df*; returns None.
    """
    n_rows, n_cols = df.shape

    for col in range(n_cols):
        for row in range(n_rows):
            cell = df.iloc[row, col]
            # Only string cells can carry the marker; skip NaN floats safely.
            if isinstance(cell, str) and cell.startswith("multirow="):
                sep = cell.find(';')
                span = int(cell[len("multirow="):sep])
                covered = df.iloc[row + 1: row + span, col]
                # Cells covered by the multirow must be empty: NaN on a raw
                # frame, or the escaped empty string after fix_table's
                # fillna(repr('')).  The old check asserted isna() only,
                # which always failed once the NaNs had been filled.
                assert covered.isna().all() or (covered == repr('')).all()
                df.iloc[row: row + span, col] = cell[sep + 1:]
63+
64+
65+
def unescape_table_content(df):
    """Return a copy of *df* with every cell decoded from its repr() form."""
    # DataFrame.applymap was deprecated in pandas 2.1 and renamed to
    # DataFrame.map; prefer the new name when the installed pandas has it.
    elementwise = getattr(df, "map", df.applymap)
    return elementwise(unescape)
67+
68+
69+
def fix_table(df):
    """Normalise a freshly-parsed table of escaped cells.

    Fills parser NaNs with the escaped empty string (so string methods are
    safe), expands htlatex multirow markers, drops rows and columns that are
    entirely empty, then unescapes every cell.  Returns a new DataFrame.
    """
    df = df.fillna(repr(''))
    fix_htlatex_multirow(df)
    # np.NaN was removed in NumPy 2.0; np.nan is the portable spelling.
    df = (df.replace("''", np.nan)
            .dropna(how='all')
            .dropna(axis='columns', how='all')
            .fillna("''"))
    return unescape_table_content(df)
74+
75+
76+
def html2data(table):
    """Parse one <table> element into a DataFrame.

    Returns the single parsed frame, or None when pandas parses nothing.
    Raises ValueError if the element unexpectedly yields several tables.
    """
    frames = pd.read_html(str(table), match='')
    if not frames:
        return None
    if len(frames) > 1:
        raise ValueError(f"<table> element contains wrong number of tables: {len(frames)}")
    return frames[0]
2981

3082

3183
def save_table(data, filename):
    """Persist *data* to *filename* as CSV with neither header row nor index column."""
    data.to_csv(filename, index=None, header=None)
3385

3486

3587
def save_tables(data, outdir):
@@ -44,25 +96,31 @@ def save_tables(data, outdir):
4496

4597

4698
def deepclone(elem):
    """Return an independent copy of *elem* by re-parsing its serialized HTML."""
    markup = str(elem)
    return BeautifulSoup(markup, "lxml")
48100

49101

50102
def extract_tables(filename, outdir):
51103
with open(filename, "rb") as f:
52104
html = f.read()
53-
outdir = Path(outdir)
54-
soup = BeautifulSoup(html, "html.parser")
105+
outdir = Path(outdir) / Path(filename).stem
106+
outdir.mkdir(parents=True, exist_ok=True)
107+
soup = BeautifulSoup(html, "lxml")
55108
flatten_tables(soup)
56109
tables = soup.select("div.tabular")
57110

58111
data = []
59112
for table in tables:
60113
if table.find("table") is not None:
61114
float_div = table.find_parent("div", class_="float")
62-
tab = html2data(filename, table)
115+
#print(table)
116+
escape_table_content(table)
117+
#print(table)
118+
tab = html2data(table)
63119
if tab is None:
64120
continue
65121

122+
tab = fix_table(tab)
123+
66124
caption = None
67125
if float_div is not None:
68126
float_div = deepclone(float_div)

0 commit comments

Comments
 (0)