Skip to content

Commit 0b26d64

Browse files
committed
Clean redundant tags
1 parent 1df3227 commit 0b26d64

File tree

2 files changed

+15
-2
lines changed

2 files changed

+15
-2
lines changed

extract_tables.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,9 +134,22 @@ def fix_layout(layout):
134134
cell.borders -= {"b", "bb", "t", "tt"}
135135

136136

137+
# Style tags that wrap only whitespace (e.g. "<bold> </bold>") or that close
# and immediately reopen the same style (e.g. "</bold> <bold>") carry no
# information and are removed from cell text.
# Limitation: each pattern is applied in a single pass, so nested redundant
# tags are not handled, e.g. </bold></red><red><bold> or <bold><bold>.
whitespace_tag_re = re.compile(r"<(bold|italic|red|green|blue)>(\s*)</\1>")
dummy_close_tag_re = re.compile(r"</(bold|italic|red|green|blue)>(\s*)<\1>")


def clear_cell(s):
    """Return *s* without redundant style tags and without outer whitespace.

    Two kinds of tag pairs are dropped, keeping any whitespace that was
    between them:

    * an opening tag directly followed by its own closing tag, with at
      most whitespace in between;
    * a closing tag directly followed by an opening tag of the same style,
      with at most whitespace in between.

    The result is stripped of leading and trailing whitespace.
    """
    # The backreference "\2" re-inserts the whitespace captured between tags.
    s = whitespace_tag_re.sub(r"\2", s)
    s = dummy_close_tag_re.sub(r"\2", s)
    return s.strip()
148+
149+
137150
def decouple_layout(df):
    """Split every ``layout;content`` cell of *df* into two data frames.

    Each non-empty cell is split at its first ``;``. The part before it is
    converted with ``to_layout``; the part after it is cleaned with
    ``clear_cell``. Empty cells yield an empty layout and empty content.

    Returns a ``(tab, layout)`` tuple: the frame of cleaned cell contents
    and the frame of layout objects.
    """
    def _split_cell(cell):
        # Empty cells carry neither layout information nor content.
        if cell == "":
            return ("", "")
        return cell.split(";", 1)

    pairs = df.applymap(_split_cell)
    tab = pairs.applymap(lambda pair: clear_cell(pair[1]))
    layout = pairs.applymap(lambda pair: to_layout(pair[0]))
    # NOTE(review): the return value is ignored, so fix_layout presumably
    # adjusts the layout frame in place - confirm against its definition.
    fix_layout(layout)
    return tab, layout

sota_extractor2/config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
# otherwise use these files
1212
data = Path("/mnt/efs/pwc/data")
13-
goldtags_dump = data / "dumps" / "goldtags-2019.07.16_2214.json.gz"
13+
goldtags_dump = data / "dumps" / "goldtags-2019.07.31_1454-htlatex-latexml.json.gz"
1414

1515

1616
elastic = dict(hosts=['localhost'], timeout=20)

0 commit comments

Comments
 (0)