Skip to content

Commit de158a6

Browse files
mkardas authored and Ubuntu committed
Fixes
1 parent 29ee077 commit de158a6

File tree

4 files changed

+35
-32
lines changed

4 files changed

+35
-32
lines changed

extract_tables.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -140,8 +140,6 @@ def fix_layout(layout):
140140
whitespace_tag_re = re.compile(r"<(bold|italic|red|green|blue)>(\s*)</\1>")
141141
dummy_close_tag_re = re.compile(r"</(bold|italic|red|green|blue)>(\s*)<\1>")
142142
def clear_cell(s):
143-
if "BP by Lyapunov equatio" in s:
144-
print(s)
145143
s = whitespace_tag_re.sub(r"\2", s)
146144
s = dummy_close_tag_re.sub(r"\2", s)
147145
return s.strip()

sota_extractor2/data/doc_utils.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,8 @@ def put_footnote_anchors(soup):
5151
for elem in soup.select('.ltx_role_footnote > .ltx_note_mark'):
5252
ft = elem.parent
5353
id_str = ft.get('id')
54-
elem.string = f" xxref-{_simplify_anchor(id_str)} "
54+
if id_str:
55+
elem.string = f" xxref-{_simplify_anchor(id_str)} "
5556

5657
for elem in soup.select('.ltx_note_content > .ltx_tag_note'):
5758
ft = elem.find_parent(class_="ltx_role_footnote")

sota_extractor2/data/elastic.py

Lines changed: 32 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from sota_extractor2.data.doc_utils import get_text, content_in_section, group_content, read_html, put_dummy_anchors, clean_abstract
1313
from .. import config
1414
from pathlib import Path
15+
import sys
1516

1617

1718
def setup_default_connection():
@@ -200,38 +201,41 @@ def parse_html(cls, soup, paper_id):
200201

201202
fragments = Fragments()
202203
doc = soup.find("article")
203-
footnotes = doc.select(".ltx_role_footnote > .ltx_note_outer")
204-
for ft in footnotes:
205-
ft.extract()
206-
207-
idx = 0
208-
for idx, idx2, section_header, content in group_content(doc):
209-
content = content.strip()
210-
if p.abstract == "" and "abstract" in section_header.lower():
211-
p.abstract = clean_abstract(content)
212-
else:
204+
if doc:
205+
footnotes = doc.select(".ltx_role_footnote > .ltx_note_outer")
206+
for ft in footnotes:
207+
ft.extract()
208+
209+
idx = 0
210+
for idx, idx2, section_header, content in group_content(doc):
211+
content = content.strip()
212+
if p.abstract == "" and "abstract" in section_header.lower():
213+
p.abstract = clean_abstract(content)
214+
else:
215+
order = (idx + 1) * 1000 + idx2
216+
f = Fragment(
217+
paper_id=paper_id,
218+
order=order,
219+
header=section_header,
220+
text=content,
221+
meta={'id': f"{paper_id}-{order}"}
222+
)
223+
fragments.append(f)
224+
idx += 1
225+
idx2 = 0
226+
for ft in footnotes:
213227
order = (idx + 1) * 1000 + idx2
214228
f = Fragment(
215-
paper_id=paper_id,
216-
order=order,
217-
header=section_header,
218-
text=content,
219-
meta={'id': f"{paper_id}-{order}"}
229+
paper_id=paper_id,
230+
order=order,
231+
header="xxanchor-footnotes Footnotes",
232+
text=get_text(ft),
233+
meta={'id': f"{paper_id}-{order}"}
220234
)
221235
fragments.append(f)
222-
idx += 1
223-
idx2 = 0
224-
for ft in footnotes:
225-
order = (idx + 1) * 1000 + idx2
226-
f = Fragment(
227-
paper_id=paper_id,
228-
order=order,
229-
header="xxanchor-footnotes Footnotes",
230-
text=get_text(ft),
231-
meta={'id': f"{paper_id}-{order}"}
232-
)
233-
fragments.append(f)
234-
idx2 += 1
236+
idx2 += 1
237+
else:
238+
print(f"No article found for {paper_id}", file=sys.stderr)
235239
p.fragments = fragments
236240
return p
237241

sota_extractor2/data/table.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def str2cell(s):
4242

4343
def read_str_csv(filename):
4444
try:
45-
df = pd.read_csv(filename, header=None, dtype=str).fillna('')
45+
df = pd.read_csv(filename, header=None, dtype=str, keep_default_na=False)
4646
except pd.errors.EmptyDataError:
4747
df = pd.DataFrame()
4848
return df

0 commit comments

Comments (0)