Skip to content

Commit 29ee077

Browse files
committed
Adapt tests to layouts and styles
* compare all output files
* sort styles to make output deterministic
* fix repeated anchors to appendix and bibliography
* flush output at the end of processing
1 parent 0b26d64 commit 29ee077

File tree

11 files changed

+44
-13
lines changed

11 files changed

+44
-13
lines changed

Makefile

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,11 @@ all: $(ANNOTATIONS_DIR)/pdfs-urls.csv $(ANNOTATIONS_DIR)/sources-urls.csv extrac
2020

2121
.PHONY: test
2222
test: DATA_DIR = test/data
23-
test: TABLE_FILE = $(TABLES_DIR)/paper/table_01.csv
2423
test:
2524
mkdir -p $(ARCHIVES_DIR)
2625
tar czf $(ARCHIVES_DIR)/paper.gz -C test/src .
2726
$(MAKE) DATA_DIR=$(DATA_DIR) --always-make extract_all
28-
cat $(TABLE_FILE)
29-
diff $(TABLE_FILE) test/src/table_01.csv
27+
diff -r $(TABLES_DIR) test/expected
3028

3129
.PHONY: extract_all extract_texts extract_tables fix_htmls_all convert_all unpack_all
3230

extract_tables.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -80,9 +80,9 @@ class LayoutCell:
8080
span: Set[str]
8181

8282
def __str__(self):
83-
borders = ['border-'+x for x in self.borders]
84-
align = ['align-'+x for x in self.align]
85-
span = ['span-'+x for x in self.span]
83+
borders = ['border-'+x for x in sorted(list(self.borders))]
84+
align = ['align-'+x for x in sorted(list(self.align))]
85+
span = ['span-'+x for x in sorted(list(self.span))]
8686
header = ["header"] if self.header else []
8787
return ' '.join(borders + align + span + header)
8888

@@ -296,7 +296,7 @@ def set_ids_by_labels(soup):
296296

297297
alg_id_re = re.compile(r"^alg(orithm)?[0-9]+")
298298
def perhaps_not_tabular(table, float_div):
299-
classes = float_div.attrs.get("class")
299+
classes = float_div.attrs.get("class", [])
300300
if 'ltx_table' in classes:
301301
return False
302302
if 'ltx_figure' in classes:

sota_extractor2/data/doc_utils.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ def _insert_anchor(el, anchor_id, prefix="xxanchor"):
2020

2121
def put_dummy_anchors(soup):
2222
for elem in soup.select(
23-
'.ltx_appendix, .ltx_bibliography, .ltx_bibitem, ' + \
23+
'.ltx_bibitem, ' + \
2424
'.ltx_figure, .ltx_float, ' + \
2525
'.ltx_picture, .ltx_theorem'):
2626
id_str = elem.get('id', '')
@@ -248,7 +248,7 @@ def flush(self):
248248
yield r
249249

250250
def new_section(self, header_el):
251-
if not self.section_output: # output (possibly) empty section so header won't be lost
251+
if not self.section_output or self.out: # output (possibly) empty section so header won't be lost
252252
yield from self.flush()
253253
self.section_output = False
254254
self.in_section = True
@@ -260,6 +260,8 @@ def append(self, el):
260260
t = get_text(el).strip()
261261
if t != "":
262262
self.out.append(t)
263+
return True
264+
return False
263265

264266
def group_content(self, doc):
265267
for el in walk(doc):
@@ -269,10 +271,14 @@ def group_content(self, doc):
269271
elif el.name == "h1":
270272
continue
271273
elif 'ltx_para' in classes or el.name == "figure" or 'ltx_bibitem' in classes:
272-
self.append(el)
273-
yield from self.flush()
274+
has_content = self.append(el)
275+
if has_content:
276+
yield from self.flush()
274277
else:
275278
self.append(el)
279+
self.in_section = True
280+
if not self.section_output or self.out:
281+
yield from self.flush()
276282

277283

278284
def group_content(doc):

sota_extractor2/data/elastic.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -207,8 +207,6 @@ def parse_html(cls, soup, paper_id):
207207
idx = 0
208208
for idx, idx2, section_header, content in group_content(doc):
209209
content = content.strip()
210-
if content == "":
211-
continue
212210
if p.abstract == "" and "abstract" in section_header.lower():
213211
p.abstract = clean_abstract(content)
214212
else:

test/expected/paper/layout_01.csv

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
border-tt align-left,border-tt align-center,border-tt align-right
2+
border-t align-left,border-t align-center,border-t align-right
3+
align-left,align-center,align-right
4+
border-t align-left,border-t align-center,border-t align-right
5+
border-bb align-left,border-bb align-center,border-bb align-right

test/expected/paper/layout_02.csv

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
border-tt align-left,border-tt align-center,border-tt align-right
2+
border-t align-left,border-t align-center,border-t align-right
3+
align-left,align-center,align-right
4+
border-bb border-t align-left,border-bb border-t align-center,border-bb border-t

test/expected/paper/metadata.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
[{"filename": "table_01.csv", "layout": "layout_01.csv", "caption": "Table 1: A table.", "figure_id": "S3.T1"}, {"filename": "table_02.csv", "layout": "layout_02.csv", "caption": "Table 2: A table.", "figure_id": "S3.T2"}]
File renamed without changes.

test/expected/paper/table_02.csv

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
<bold>bold text</bold>,<italic>italic text</italic>,<italic><bold>bold italic text</bold></italic>
2+
<red>red text</red>,<green>green text</green>,<blue>blue text</blue>
3+
<bold>5.4%</bold>,<italic>3.8%</italic>,<bold>11.2</bold>±0.15
4+
<bold>an <italic>italic</italic> text inside bold</bold>,<red><bold>bold red</bold></red>,

test/expected/paper/text.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"title":"DILBERT: Distilling Inner Latent BERT variables","authors":"John Doe","abstract":"In this paper we achieve state-of-the-art performance in random number generation.","fragments":[{"paper_id":"paper","order":1000,"header":"xxanchor-S1 1 Introduction","text":""},{"paper_id":"paper","order":2000,"header":"xxanchor-S2 2 Model","text":""},{"paper_id":"paper","order":3000,"header":"xxanchor-S2SS1 2.1 Preprocessing","text":""},{"paper_id":"paper","order":4000,"header":"xxanchor-S2SS2 2.2 Architecture","text":""},{"paper_id":"paper","order":5000,"header":"xxanchor-S3 3 Experiments","text":"In this section we present Table xxref-S3T1."},{"paper_id":"paper","order":5001,"header":"xxanchor-S3 3 Experiments","text":"xxtable-xxanchor-S3T1 Table 1: A table."},{"paper_id":"paper","order":5002,"header":"xxanchor-S3 3 Experiments","text":"xxtable-xxanchor-S3T2 Table 2: A table."}]}

0 commit comments

Comments
 (0)