Skip to content

Commit 77614ae

Browse files
committed
Extract references and table labels
1 parent 4e9cae8 commit 77614ae

File tree

2 files changed

+26
-6
lines changed

2 files changed

+26
-6
lines changed

extract-tables.py renamed to extract_tables.py

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#!/usr/bin/env python
22

33
import sys
4-
from bs4 import BeautifulSoup, Comment
4+
from bs4 import BeautifulSoup, Comment, NavigableString
55
import fire
66
from pathlib import Path
77
import pandas as pd
@@ -69,6 +69,11 @@ def fix_table(df):
6969
return unescape_table_content(df)
7070

7171

72+
# Annotate every in-document link inside `table` with a textual reference
# marker. `table` is modified in place and nothing is returned.
def move_out_references(table):
73+
# CSS attribute selector: <a> elements whose href starts with "#".
for anchor in table.select('a[href^="#"]'):
74+
# Appends "[xxref-<href without the leading '#'>]" as a text node inside the
# anchor itself; the anchor element is kept where it is.
# NOTE(review): presumably this keeps the link target visible once the
# table is reduced to text by html2data -- confirm against the consumer.
anchor.append(NavigableString("[xxref-"+anchor["href"][1:]+"]"))
75+
76+
7277
def html2data(table):
7378
data = pd.read_html(str(table), match='')
7479
if len(data) > 1:
@@ -86,7 +91,7 @@ def save_tables(data, outdir):
8691
for num, table in enumerate(data, 1):
8792
filename = f"table_{num:02}.csv"
8893
save_table(table.data, outdir / filename)
89-
metadata.append(dict(filename=filename, caption=table.caption))
94+
metadata.append(dict(filename=filename, caption=table.caption, figure_id=table.figure_id))
9095
with open(outdir / "metadata.json", "w") as f:
9196
json.dump(metadata, f)
9297

@@ -95,20 +100,34 @@ def deepclone(elem):
95100
return BeautifulSoup(str(elem), "lxml")
96101

97102

103+
# Copy label names found in HTML comments next to captions onto the tables
# that share the caption's parent, as a "data-figure-id" attribute.
# `soup` is modified in place and nothing is returned.
def set_ids_by_labels(soup):
104+
# Every element carrying the CSS class "caption".
captions = soup.select(".caption")
105+
# Marker that a comment must start with to be treated as a label.
prefix = "tex4ht:label?:"
106+
for caption in captions:
107+
# Only the node immediately following the caption is inspected.
# NOTE(review): if a whitespace text node sits between the caption and the
# comment, next_sibling returns that text node and the label is skipped --
# confirm the generated HTML never has such whitespace.
el = caption.next_sibling
108+
if isinstance(el, Comment) and el.string.startswith(prefix):
109+
# The label is whatever follows the prefix, with surrounding whitespace removed.
label = el.string[len(prefix):].strip()
110+
# All <table> descendants of the caption's parent get the same label.
for table in caption.parent.select("table"):
111+
# Read back later in extract_tables via table_el.get("data-figure-id").
table["data-figure-id"] = label
112+
113+
98114
def extract_tables(filename, outdir):
99115
with open(filename, "rb") as f:
100116
html = f.read()
101117
outdir = Path(outdir) / Path(filename).stem
102118
outdir.mkdir(parents=True, exist_ok=True)
103119
soup = BeautifulSoup(html, "lxml")
104120
flatten_tables(soup)
121+
set_ids_by_labels(soup)
105122
tables = soup.select("div.tabular")
106123

107124
data = []
108125
for table in tables:
109-
if table.find("table") is not None:
126+
table_el = table.find("table")
127+
if table_el is not None:
110128
float_div = table.find_parent("div", class_="float")
111129
#print(table)
130+
move_out_references(table)
112131
escape_table_content(table)
113132
#print(table)
114133
tab = html2data(table)
@@ -123,8 +142,8 @@ def extract_tables(filename, outdir):
123142
for t in float_div.find_all("table"):
124143
t.extract()
125144
caption = float_div.get_text()
126-
127-
data.append(Tabular(tab, caption))
145+
figure_id = table_el.get("data-figure-id")
146+
data.append(Tabular(tab, caption, figure_id))
128147

129148
save_tables(data, outdir)
130149

tabular.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,13 @@
44

55

66
class Tabular:
7-
def __init__(self, data, caption):
7+
def __init__(self, data, caption, figure_id=None):
88
self.data = data
99
self.cell_tags = pd.DataFrame().reindex_like(data).fillna('')
1010
self.datasets = set()
1111
self.metrics = set()
1212
self.caption = caption
13+
self.figure_id = figure_id
1314

1415
def mark_with_metric(self, metric_name):
1516
self.metrics.add(metric_name)

0 commit comments

Comments
 (0)