Skip to content

Commit a5a776a

Browse files
committed
Migrate structure annotations
* remove empty tables from extraction results * remove undefined LaTeX commands from output cells * remove algorithms and graphics * keep arXiv versions in paper collection * ignore the arXiv version when searching in the paper collection * keep each cell's raw_value to be able to locate references * during migration of structure annotations, match tables using cell contexts (consider the 8-connected neighbourhood of a cell first, then its direct neighbours in row and column, and finally the cell value only)
1 parent 9b98cc7 commit a5a776a

File tree

3 files changed

+182
-26
lines changed

3 files changed

+182
-26
lines changed

extract_tables.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,9 @@ def fix_table(df):
149149
return decouple_layout(df)
150150

151151

152+
def is_table_empty(df):
153+
return (df.applymap(lambda x: x.strip()).values == "").all()
154+
152155
def fix_id(s):
    """Sanitize identifier *s* for use in HTML ids: dots become dashes."""
    return s.replace(".", "-")
154157

@@ -198,6 +201,11 @@ def move_out_styles(table):
198201
wrap_elem_content(elem, f"{b},{a},{header},{colspan},{rowspan};", "")
199202

200203

204+
def remove_ltx_errors(soup):
    """Remove all LaTeXML error markers (<span class="ltx_ERROR">) from *soup*.

    Mutates the BeautifulSoup tree in place; these spans hold undefined
    LaTeX commands that would otherwise leak into extracted cell text.
    """
    for span in soup.select('span.ltx_ERROR'):
        span.extract()
207+
208+
201209
def html2data(table):
202210
data = pd.read_html(str(table), match='')
203211
if len(data) > 1:
@@ -231,6 +239,26 @@ def set_ids_by_labels(soup):
231239
for table in fig.select(".ltx_tabular"):
232240
table["data-figure-id"] = label
233241

242+
243+
alg_id_re = re.compile(r"^alg(orithm)?[0-9]+")

def perhaps_not_tabular(table, float_div):
    """Heuristically decide whether *table* is probably not a real data table.

    Returns True for tabulars that merely lay out graphics, biographies,
    algorithms or listings inside the enclosing float *float_div*;
    False for genuine tables (ltx_table floats) or anything unrecognized.
    """
    # A tag may have no class attribute at all; fall back to an empty list.
    classes = float_div.attrs.get("class") or []
    if 'ltx_table' in classes:
        return False
    if 'ltx_figure' in classes:
        # A tabular wrapping graphics is a figure layout, not a data table.
        if table.find("img", class_="ltx_graphics"):
            return True
    if 'ltx_float' in classes:
        if 'biography' in classes:
            return True
        # BUG FIX: the original tested the bare literals 'ltx_float_algorithm'
        # and 'ltx_lstlisting' (always truthy), so *every* ltx_float was
        # rejected; membership in the class list is what was intended.
        if 'ltx_float_algorithm' in classes:
            return True
        if 'ltx_lstlisting' in classes:
            return True
    # BUG FIX: bs4's `float_div.id` looks up a child <id> tag, not the HTML
    # id attribute; use .get("id") to read the attribute.
    div_id = float_div.get("id")
    if div_id and alg_id_re.match(div_id):
        return True
    return False
261+
234262
def is_figure(tag):
235263
return tag.name == "figure"
236264
# classes = tag.attrs.get("class", [])
@@ -270,6 +298,7 @@ def extract_tables(filename, outdir):
270298
set_ids_by_labels(soup)
271299
fix_span_tables(soup)
272300
fix_th(soup)
301+
remove_ltx_errors(soup)
273302
flatten_tables(soup)
274303
tables = soup.find_all("table", class_="ltx_tabular")
275304

@@ -279,6 +308,8 @@ def extract_tables(filename, outdir):
279308
continue
280309

281310
float_div = table.find_parent(is_figure)
311+
if float_div and perhaps_not_tabular(table, float_div):
312+
continue
282313
remove_footnotes(table)
283314
move_out_references(table)
284315
move_out_styles(table)
@@ -288,6 +319,8 @@ def extract_tables(filename, outdir):
288319
continue
289320

290321
tab, layout = fix_table(tab)
322+
if is_table_empty(tab):
323+
continue
291324

292325
caption = None
293326
if float_div is not None:

sota_extractor2/data/paper_collection.py

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
class Paper:
1212
def __init__(self, paper_id, text, tables, annotations):
1313
self.paper_id = paper_id
14+
self.arxiv_no_version = remove_arxiv_version(paper_id)
1415
if text is not None:
1516
self.text = text
1617
else:
@@ -32,20 +33,18 @@ def remove_arxiv_version(arxiv_id):
3233
def _load_texts(path, jobs):
    """Read every **/text.json under *path* in parallel.

    Returns a dict keyed by the full (versioned) arXiv id of each paper;
    versions are deliberately kept (matching is version-aware downstream).
    """
    files = list(path.glob("**/text.json"))
    texts = Parallel(n_jobs=jobs, prefer="processes")(
        delayed(PaperText.from_file)(f) for f in files)
    return {text.meta.id: text for text in texts}
3637

3738

3839
def _load_tables(path, annotations, jobs):
    """Read every **/metadata.json under *path* in parallel.

    Each paper directory name is its full (versioned) arXiv id; that id is
    used both to look up *annotations* and as the key of the result dict.
    """
    files = list(path.glob("**/metadata.json"))
    tables = Parallel(n_jobs=jobs, prefer="processes")(
        delayed(read_tables)(f.parent, annotations.get(f.parent.name)) for f in files)
    return {f.parent.name: tbls for f, tbls in zip(files, tables)}
43+
4244

4345
def _load_annotated_papers(path):
    """Load the GraphQL annotations dump at *path*.

    The dump is treated as gzip-compressed iff the file suffix is ".gz".
    Returns a dict keyed by each annotation's full (versioned) arXiv id.
    """
    dump = load_gql_dump(path, compressed=path.suffix == ".gz")["allPapers"]
    annotations = {a.arxiv_id: a for a in dump}
    return annotations
5049

5150

@@ -74,12 +73,18 @@ def from_files(cls, path, annotations_path=None, load_texts=True, load_tables=Tr
7473
papers = [Paper(k, texts.get(k), tables.get(k, []), annotations.get(k)) for k in outer_join]
7574
return cls(papers)
7675

77-
def get_by_id(self, paper_id, ignore_version=True):
    """Return the paper with the given arXiv id, or None if absent.

    With ignore_version=True (default) the version suffix (e.g. "v2") is
    stripped from *paper_id* and papers are matched on their version-less
    id; otherwise the id must match exactly.
    """
    if ignore_version:
        paper_id = remove_arxiv_version(paper_id)
        wanted = lambda p: p.arxiv_no_version == paper_id
    else:
        wanted = lambda p: p.paper_id == paper_id
    # Single scan shared by both modes (the original duplicated the loop).
    return next((p for p in self.data if wanted(p)), None)
8388

8489
@classmethod
8590
def cells_gold_tags_legend(cls):

sota_extractor2/data/table.py

Lines changed: 130 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import pandas as pd
2+
import numpy as np
23
import json
34
from pathlib import Path
45
import re
@@ -9,6 +10,7 @@
910
@dataclass
1011
class Cell:
1112
value: str
13+
raw_value: str
1214
gold_tags: str = ''
1315
refs: List[str] = field(default_factory=list)
1416
layout: str = ''
@@ -36,7 +38,7 @@ def extract_references(s):
3638

3739
def str2cell(s):
    """Build a Cell from raw string *s*.

    The raw text is preserved in raw_value so references can be located
    later; extract_references returns the cleaned value and reference ids.
    """
    value, refs = extract_references(s)
    return Cell(value=value, raw_value=s, refs=refs)
4042

4143
def read_str_csv(filename):
4244
try:
@@ -49,34 +51,51 @@ def read_str_csv(filename):
4951

5052

5153
class Table:
    def __init__(self, df, layout, caption=None, figure_id=None, annotations=None, old_name=None, guessed_tags=None):
        """Build a Table of Cell objects from the string DataFrame *df*.

        layout       -- optional DataFrame of per-cell layout strings,
                        copied onto the corresponding Cell objects
        annotations  -- optional gold annotations for this table
                        (gold_tags, dataset_text, notes, matrix_gold_tags)
        old_name     -- name of the matched previously-annotated table
        guessed_tags -- optional DataFrame of per-cell tags recovered by
                        the content-based matcher; takes precedence over
                        annotations.matrix_gold_tags
        """
        self.df = df
        self.caption = caption
        self.figure_id = figure_id
        self.df = df.applymap(str2cell)
        self.old_name = old_name

        if layout is not None:
            # The layout is stored per-cell rather than as a whole frame.
            for r, row in layout.iterrows():
                for c, cell in enumerate(row):
                    self.df.iloc[r, c].layout = cell

        if annotations is not None:
            self.gold_tags = annotations.gold_tags.strip()
            self.dataset_text = annotations.dataset_text.strip()
            self.notes = annotations.notes.strip()
            # Tags guessed from cell contexts win over the stored matrix.
            if guessed_tags is not None:
                tags = guessed_tags.values
            else:
                tags = annotations.matrix_gold_tags
            gt_rows = len(tags)
            if gt_rows == 0 and len(self.df) > 0:
                # No per-cell tags for a non-empty table: drop the old-name
                # link so the table is treated as needing re-annotation.
                self.old_name = None
            elif gt_rows > 0:
                gt_cols = len(tags[0])
                if self.df.shape != (0, 0) and self.df.shape == (gt_rows, gt_cols):
                    for r, row in enumerate(tags):
                        for c, cell in enumerate(row):
                            self.df.iloc[r, c].gold_tags = cell.strip()
                else:
                    # Shape mismatch: only report it when the tags came from
                    # the guesser (stored matrices are known to be stale).
                    if guessed_tags is not None:
                        print(f"Gold tags size mismatch: {gt_rows},{gt_cols} vs {self.df.shape}")
                    self.old_name = None
        else:
            self.gold_tags = ''
            self.dataset_text = ''
            self.notes = ''
7796

7897
@classmethod
79-
def from_file(cls, path, metadata, annotations=None, match_name=None):
98+
def from_file(cls, path, metadata, annotations=None, match_name=None, guessed_tags=None):
8099
path = Path(path)
81100
filename = path / metadata['filename']
82101
df = read_str_csv(filename)
@@ -89,7 +108,7 @@ def from_file(cls, path, metadata, annotations=None, match_name=None):
89108
table_ann = table_ann[0]
90109
else:
91110
table_ann = None
92-
return cls(df, layout, metadata.get('caption'), metadata.get('figure_id'), table_ann, match_name)
111+
return cls(df, layout, metadata.get('caption'), metadata.get('figure_id'), table_ann, match_name, guessed_tags)
93112

94113
def display(self):
95114
display_table(self.df.applymap(lambda x: x.value).values, self.df.applymap(lambda x: x.gold_tags).values)
@@ -104,11 +123,15 @@ def display(self):
104123
import string
105124
from collections import Counter
106125

126+
# Strip leading "Table 3" / "figure iv" style prefixes before comparison.
# NOTE: raw string fixes the invalid "\s" escape in the original literal.
figure_prefix_re = re.compile(r'^(table|figure)\s+([0-9]+|[ivxl]+)?')
punctuation_table = str.maketrans('', '', string.punctuation)

def normalize_string(s):
    """Normalize *s* for fuzzy caption/cell matching.

    Lowercases, drops a leading table/figure prefix, removes ordinary and
    non-breaking spaces, transliterates to ASCII and strips punctuation.
    None maps to "".
    """
    if s is None:
        return ""

    s = s.strip().lower()
    s = figure_prefix_re.sub('', s).strip()
    return unidecode(s.replace('\xa0', '').replace(' ', '')).translate(punctuation_table)
112135

113136
def _remove_almost_empty_values(d):
114137
return {k:v for k,v in d.items() if len(v) >= 10}
@@ -128,11 +151,106 @@ def _match_tables_by_captions(annotations, metadata):
128151
old_captions_reverse = {v:k for k,v in old_captions.items()}
129152
return {new_name:old_captions_reverse[caption] for new_name, caption in new_captions.items() if caption in old_captions_reverse}
130153

154+
def normalize_cell(s):
    """Normalize a single table cell for content-based matching."""
    # Reference markers are currently kept as-is:
    # s = reference_re.sub(' [] ', s)
    return normalize_string(s)
157+
158+
# begin of guess annotations mapping
159+
# begin of guess annotations mapping
def create_cell_contexts(df):
    """Return per-cell contexts of *df*, ordered most specific first.

    For each cell: its 8-connected neighbourhood including itself
    (9 values, row-major), its row neighbours (left, self, right), its
    column neighbours (above, self, below), and finally the bare cell
    value.  Positions outside the table contribute "".
    """
    cell_context = df.values
    # Pad with one ring of empty strings so edge cells get "" neighbours.
    cells = np.pad(cell_context, 1, mode='constant', constant_values='')

    slices = [slice(None, -2), slice(1, -1), slice(2, None)]

    row_context = np.stack([cells[1:-1, s] for s in slices], axis=-1)
    col_context = np.stack([cells[s, 1:-1] for s in slices], axis=-1)
    box_context = np.stack([cells[s1, s2] for s1 in slices for s2 in slices], axis=-1)
    return box_context, row_context, col_context, cell_context[..., None]
169+
170+
def map_context(context, values):
    """Map each context tuple to the set of values observed with it.

    *context* has shape (rows, cols, ctx_len) and *values* (rows, cols);
    a context mapping to more than one value is ambiguous for the guesser.
    """
    ctx_len = context.shape[-1]
    mapping = {}
    for ctx, val in zip(context.reshape((-1, ctx_len)), values.reshape(-1)):
        mapping.setdefault(tuple(ctx), set()).add(val)
    return mapping
176+
177+
REANNOTATE_TAG = 'reannotate'

def guess_annotations(old_table, gold_tags, new_table):
    """Transfer per-cell *gold_tags* from *old_table* onto *new_table*.

    For every cell of *new_table*, its contexts (8-connected box, row
    neighbours, column neighbours, then the bare value — most specific
    first) are looked up in the context->tags mappings built from
    *old_table*; the first context that maps to exactly one tag wins.
    Cells with no unambiguous match keep REANNOTATE_TAG.

    Returns (matched, df): the number of matched cells and a DataFrame of
    guessed tags with the same shape as *new_table*.
    """
    df = pd.DataFrame().reindex_like(new_table).fillna(REANNOTATE_TAG)
    if old_table.empty:
        return 0, df
    old_contexts = create_cell_contexts(old_table)
    old_mappings = [map_context(ctx, gold_tags.values) for ctx in old_contexts]
    new_contexts = create_cell_contexts(new_table)

    rows, cols = new_table.shape
    matched = 0
    for row in range(rows):
        for col in range(cols):
            # Contexts are ordered from most to least specific.
            for mapping, context in zip(old_mappings, new_contexts):
                ctx = tuple(context[row, col])
                values = mapping.get(ctx, set())
                if len(values) == 1:
                    (val,) = values
                    df.iloc[row, col] = val
                    matched += 1
                    break
    return matched, df
200+
201+
# end of guess annotations mapping
202+
203+
204+
def same_table(old_table, new_table):
    """True when the two DataFrames have identical shape, elements and dtypes."""
    return old_table.equals(new_table)
206+
207+
DEB_PAPER="1607.00036v2"

def deb(path, old_name, old_table, new_name, new_table):
    """Debug helper: dump both tables when matching DEB_PAPER's table_02.csv
    against itself.  No effect for any other paper or table pair.
    """
    if path.name == DEB_PAPER and old_name == "table_02.csv" == new_name:
        print(old_table)
        print(new_table)
213+
214+
def _match_tables_by_content(path, annotations, metadata):
    """Match newly extracted tables to annotated ones by cell content.

    For every new table, guess_annotations scores each annotated (old)
    table by how many cells can be unambiguously tagged from the old
    table's cell contexts; the highest-scoring old table is accepted only
    when more than half of the new table's cells were matched.

    Returns (matched, new_tags): new filename -> old table name, and
    new filename -> DataFrame of guessed per-cell gold tags.
    """
    if annotations is None:
        return {}, {}
    old_tables = {x.name: (pd.DataFrame(x.matrix).applymap(normalize_cell),
                           pd.DataFrame(x.matrix_gold_tags))
                  for x in annotations.table_set}
    new_tables = {m['filename']: Table.from_file(path, m, None, None).df.applymap(lambda c: normalize_cell(c.value))
                  for m in metadata}
    matched = {}
    new_tags = {}
    for new_name, new_table in new_tables.items():
        max_hits = 0
        matched_name = None
        size = np.prod(new_table.shape)
        guessed_tags = None
        for old_name, (old_table, gold_tags) in old_tables.items():
            hits, tags = guess_annotations(old_table, gold_tags, new_table)
            # Strict '>' keeps the first old table on a tie.
            if hits > max_hits:
                max_hits = hits
                matched_name = old_name
                guessed_tags = tags
        # Accept only when most of the new table could be matched.
        if max_hits > size / 2:
            matched[new_name] = matched_name
            new_tags[new_name] = guessed_tags
    return matched, new_tags
131242
####
132243

133244
def read_tables(path, annotations):
    """Read all extracted tables of the paper directory *path*.

    Tables are matched to previously annotated ones by cell content
    (caption-based matching is currently disabled); a conflict between the
    two strategies is reported rather than silently overwritten.
    Returns a list of Table objects, one per entry in metadata.json.
    """
    path = Path(path)
    with open(path / "metadata.json", "r") as f:
        metadata = json.load(f)
    # Caption matching disabled: _match_tables_by_captions(annotations, metadata)
    _matched_names_by_captions = {}
    _matched_names_by_content, _guessed_tags = _match_tables_by_content(path, annotations, metadata)
    _matched_names = _matched_names_by_captions
    for new_name, old_name in _matched_names_by_content.items():
        if new_name in _matched_names and _matched_names[new_name] != old_name:
            print(f"Multiple matches for table {path}/{new_name}: {_matched_names[new_name]} by caption and {old_name} by content")
        else:
            _matched_names[new_name] = old_name
    return [Table.from_file(path, m, annotations,
                            match_name=_matched_names.get(m["filename"]),
                            guessed_tags=_guessed_tags.get(m["filename"]))
            for m in metadata]

0 commit comments

Comments
 (0)