Skip to content

Commit c68d12f

Browse files
committed
Separate html parsing from elastic
1 parent 3be7094 commit c68d12f

File tree

3 files changed

+162
-161
lines changed

3 files changed

+162
-161
lines changed

normalize_references.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import ahocorasick
66
import pickle
77
from multiprocessing import Pool
8-
from sota_extractor2.data.elastic import get_text, Paper
8+
from sota_extractor2.data.doc_utils import get_text, read_html
99

1010
punctuation_table = str.maketrans('', '', string.punctuation)
1111

@@ -62,7 +62,7 @@ def save_html(path, html):
6262
def resolve_references_in_html(args):
6363
file, output = args
6464
output.parent.mkdir(exist_ok=True, parents=True)
65-
html = Paper.read_html(f)
65+
html = read_html(file)
6666
bibitems = get_bibitems(html)
6767
mapping = resolve_references(reference_trie, bibitems)
6868
update_references(html, mapping)

sota_extractor2/data/doc_utils.py

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
import re
2+
from bs4 import BeautifulSoup, Comment, Tag
3+
import codecs
4+
5+
def _handle_reference(el):
6+
if el.get('href', "").startswith("#"):
7+
r = str(el.get('href'))
8+
el.clear() # to remove it's content from the descendants iterator
9+
return "xxref-" + r[1:]
10+
11+
12+
def _handle_anchor(el):
13+
if el.get('id', ""):
14+
id_str = el.get('id', "")
15+
el.clear() # to remove it's content from the descendants iterator
16+
return "xxanchor-" + id_str
17+
18+
19+
def _handle_table(el):
20+
if el.name.lower() == 'table':
21+
id_str = el.get('id', "xxunk")
22+
el.clear() # to remove it's content from the descendants iterator
23+
return f"xxtable-xxanchor-" + id_str
24+
25+
26+
# Element transforms tried in order by transform(); the first handler that
# returns a non-None token wins, so references are recognised before tables,
# and tables before plain anchors.
_transforms_el = [
    _handle_reference,
    _handle_table,
    _handle_anchor,
]
31+
32+
33+
def transform(el):
    """Convert one soup node into its text representation.

    Tags are run through the ``_transforms_el`` handlers; the first non-None
    token is itself re-transformed (tokens are plain strings, so they come
    back via ``str``).  Comments produce '', any other node is stringified.
    """
    if not isinstance(el, Tag):
        return '' if isinstance(el, Comment) else str(el)
    for handler in _transforms_el:
        token = handler(el)
        if token is not None:
            return transform(token)
    return ''
42+
43+
44+
def get_text(*els):
    """Flatten one or more soup elements into normalised plain text.

    Every descendant node is passed through transform(); plain values
    without a ``descendants`` attribute are used as-is (getattr fallback).
    The joined text is then cleaned up in an order-sensitive regex pipeline.
    """
    t = " ".join([transform(t)
                  for el in els for t in getattr(el, 'descendants', [el])])
    # drop a leading "Abstract"/"abstract" heading left over from parsing
    t = re.sub("^[aA]bstract ?", "", t)
    # collapse runs of spaces, newlines and non-breaking spaces
    t = re.sub("[ \n\xa0]+", " ", t)
    # strip punctuation hugging "#..."-style tokens (presumably citation
    # markers produced upstream — confirm against callers)
    t = re.sub("[;,()]* (#[A-Za-z0-9]+) [;,()]*", r" \1 ", t)
    # de-duplicate an immediately repeated "#..." token
    t = re.sub(r" (#[A-Za-z0-9]+) *\1 ", r" \1 ", t)
    return t.strip()
52+
53+
54+
def content_in_section(header, names=('h3', 'h4'), skip_comments=True):
    """Yield the siblings following *header* up to the next section header.

    Iteration stops (exclusively) at the first sibling whose tag name is in
    *names*.  Comment nodes are skipped unless *skip_comments* is False.

    The default for *names* is a tuple rather than the original list: a
    mutable default argument is shared between calls (classic pitfall);
    membership testing works identically on a tuple.
    """
    for el in header.next_siblings:
        if getattr(el, 'name', '') in names:
            break
        if skip_comments and isinstance(el, Comment):
            continue
        yield el
61+
62+
63+
def get_class(el):
    """Return the first CSS class of *el*, or '' when it has none.

    Nodes without a ``get`` method (plain strings, comments) also yield ''.
    Resolves the old fixme: bs4 stores ``class`` as a list of names which may
    be missing (None) or empty, so fall back to [''] before indexing instead
    of the previous ``(get(...) + [''])[0]`` trick.
    """
    if not hasattr(el, 'get'):
        return ''
    classes = el.get('class') or ['']
    return classes[0]
69+
70+
71+
def get_name(el):
    """Return ``el.name``, or '' when the attribute is absent or falsy.

    Replaces the old ``hasattr(el, 'name') and el.name or ''`` — the
    ``and/or`` conditional idiom is a known pitfall; ``getattr`` with a
    default plus ``or ''`` is equivalent here (a None name, e.g. on text
    nodes, still maps to '') and far clearer.
    """
    return getattr(el, 'name', '') or ''
73+
74+
75+
def _group_bibliography(el):
    """Return bibliography entries of *el* as a list of text fragments.

    Only elements with class 'thebibliography' produce output; each
    ``p.bibitem`` child becomes one fragment.  Anything else yields [].
    """
    if get_class(el) != 'thebibliography':
        return []
    return [get_text(item) for item in el.select('p.bibitem')]
79+
80+
81+
def _group_table(el):
    """Return *el*'s text as a one-fragment list when it has class 'table',
    otherwise an empty list."""
    return [get_text(el)] if get_class(el) == 'table' else []
85+
86+
87+
class ParagraphGrouper:
    """Accumulate consecutive soup elements into paragraph-sized text chunks.

    Elements are fed through collect().  A ``<p>`` normally closes the
    current chunk and starts a new one, except directly after a ``<table>``,
    where the paragraph is merged into the current chunk instead.
    """

    def __init__(self):
        self.els = []             # elements of the chunk currently being built
        self.join_next_p = False  # set after a table: merge the next <p>

    def collect(self, el):
        """Feed one element; return a list of finished chunks (often empty)."""
        if get_name(el) == 'table':
            # the table element itself is not buffered (tables are grouped
            # separately via _group_table); only remember to merge the
            # paragraph that follows it
            self.join_next_p = True
        elif get_name(el) == "p":
            if self.join_next_p:
                self.join_next_p = False
                self.els.append(el)
            else:
                # a fresh paragraph closes the current chunk
                return self.flush(new_els=[el])
        else:
            self.els.append(el)
        return []

    def flush(self, new_els=None):
        """Emit the buffered chunk's text (if any) and restart with *new_els*."""
        text = get_text(*self.els)
        if new_els is None:
            new_els = []
        if isinstance(new_els, Tag):  # allow for one tag to be passed
            new_els = [new_els]
        self.els = new_els
        if text:
            return [text]
        return []

    def reset(self):
        """Drop all buffered state.

        Bug fix: previously ``join_next_p`` survived reset(), so a table seen
        just before reset() could wrongly merge an unrelated paragraph
        afterwards.
        """
        self.els = []
        self.join_next_p = False
118+
119+
120+
# Grouping rules tried for every element in group_content(); each matching
# rule contributes stand-alone text fragments for that element.
_group_el = [
    _group_bibliography,
    _group_table,
]
124+
125+
126+
def group_content(elements):
    """Yield text fragments for *elements*, grouping paragraphs together.

    Bibliographies and tables (matched by the ``_group_el`` rules) are
    emitted as their own fragments, flushing any pending paragraph first;
    everything else is accumulated by a ParagraphGrouper and emitted in
    paragraph-sized chunks.
    """
    grouper = ParagraphGrouper()
    for el in elements:
        special = [frag for rule in _group_el for frag in rule(el)]
        if special:
            # close the paragraph in progress before the special fragments
            yield from grouper.flush()
            yield from special
        else:
            yield from grouper.collect(el)

    # emit whatever is still buffered at the end
    yield from grouper.flush()
139+
140+
141+
def set_ids_by_labels(soup):
    """Copy tex4ht label comments onto their tables as ``id`` attributes.

    tex4ht leaves a comment of the form ``tex4ht:label?: <label>`` directly
    after a caption; every ``<table>`` under that caption's parent gets the
    label as its ``id`` so it can be referenced later.
    """
    prefix = "tex4ht:label?:"
    for caption in soup.select(".caption"):
        sibling = caption.next_sibling
        if not isinstance(sibling, Comment):
            continue
        if not sibling.string.startswith(prefix):
            continue
        label = sibling.string[len(prefix):].strip()
        for table in caption.parent.select("table"):
            table["id"] = label
150+
151+
def read_html(file):
    """Read *file* as UTF-8 text and parse it with BeautifulSoup.

    Uses the stdlib ``html.parser`` backend so no extra parser dependency
    is required.
    """
    # codecs.open is a Python-2 legacy; the built-in open has handled
    # encodings since Python 3 and also gives universal-newline handling
    with open(file, 'r', encoding='UTF-8') as f:
        text = f.read()
    return BeautifulSoup(text, "html.parser")

sota_extractor2/data/elastic.py

Lines changed: 6 additions & 159 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,15 @@
11
import pandas as pd
22
import re
3-
import numpy as np
4-
import elasticsearch
5-
from bs4 import BeautifulSoup, Comment, Tag
6-
import codecs
7-
import textwrap
3+
from bs4 import BeautifulSoup
84

9-
from datetime import datetime
10-
from elasticsearch_dsl import Document, Date, Nested, Boolean, Object, \
11-
analyzer, InnerDoc, Completion, Keyword, Text, Integer, tokenizer, token_filter
5+
from elasticsearch_dsl import Document, Boolean, Object, \
6+
analyzer, InnerDoc, Keyword, Text, Integer, tokenizer, token_filter
127

13-
from IPython.display import display, Markdown, Latex
8+
from IPython.display import display, Markdown
149

1510
from elasticsearch_dsl import connections
1611

12+
from sota_extractor2.data.doc_utils import get_text, content_in_section, group_content, set_ids_by_labels, read_html
1713
from .. import config
1814

1915

@@ -26,153 +22,6 @@ def printmd(*args): # fixme: make it work without jupyter notebook
2622
display(Markdown(" ".join(map(str, args))))
2723

2824

29-
def _handle_reference(el):
30-
if el.get('href', "").startswith("#"):
31-
r = str(el.get('href'))
32-
el.clear() # to remove it's content from the descendants iterator
33-
return "xxref-" + r[1:]
34-
35-
36-
def _handle_anchor(el):
37-
if el.get('id', ""):
38-
id_str = el.get('id', "")
39-
el.clear() # to remove it's content from the descendants iterator
40-
return "xxanchor-" + id_str
41-
42-
43-
def _handle_table(el):
44-
if el.name.lower() == 'table':
45-
id_str = el.get('id', "xxunk")
46-
el.clear() # to remove it's content from the descendants iterator
47-
return f"xxtable-xxanchor-" + id_str
48-
49-
50-
_transforms_el = [
51-
_handle_reference,
52-
_handle_table,
53-
_handle_anchor,
54-
]
55-
56-
57-
def transform(el):
58-
if isinstance(el, Tag):
59-
for f in _transforms_el:
60-
r = f(el)
61-
if r is not None:
62-
return transform(r)
63-
elif not isinstance(el, Comment):
64-
return str(el)
65-
return ''
66-
67-
68-
def get_text(*els):
69-
t = " ".join([transform(t)
70-
for el in els for t in getattr(el, 'descendants', [el])])
71-
t = re.sub("^[aA]bstract ?", "", t)
72-
t = re.sub("[ \n\xa0]+", " ", t)
73-
t = re.sub("[;,()]* (#[A-Za-z0-9]+) [;,()]*", r" \1 ", t)
74-
t = re.sub(r" (#[A-Za-z0-9]+) *\1 ", r" \1 ", t)
75-
return t.strip()
76-
77-
78-
def content_in_section(header, names=['h3', 'h4'], skip_comments=True):
79-
for el in header.next_siblings:
80-
if getattr(el, 'name', '') in names:
81-
break
82-
if skip_comments and isinstance(el, Comment):
83-
continue
84-
yield el
85-
86-
87-
def get_class(el):
88-
if hasattr(el, 'get'):
89-
# fixme: less convoluted way to return '' if calss is not found
90-
return (el.get('class', [''])+[''])[0]
91-
else:
92-
return ''
93-
94-
95-
def get_name(el):
96-
return hasattr(el, 'name') and el.name or ''
97-
98-
99-
def _group_bibliography(el):
100-
if get_class(el) == 'thebibliography':
101-
return [get_text(i) for i in el.select('p.bibitem')]
102-
return []
103-
104-
105-
def _group_table(el):
106-
if get_class(el) == 'table':
107-
return [get_text(el)]
108-
return []
109-
110-
111-
class ParagraphGrouper:
112-
def __init__(self):
113-
self.els = []
114-
self.join_next_p = False
115-
116-
def collect(self, el):
117-
if get_name(el) == 'table':
118-
self.join_next_p = True
119-
elif get_name(el) == "p":
120-
if self.join_next_p:
121-
self.join_next_p = False
122-
self.els.append(el)
123-
else:
124-
return self.flush(new_els=[el])
125-
else:
126-
self.els.append(el)
127-
return []
128-
129-
def flush(self, new_els=None):
130-
text = get_text(*self.els)
131-
if new_els is None:
132-
new_els = []
133-
if isinstance(new_els, Tag): # allow for one tag to be passed
134-
new_els = [new_els]
135-
self.els = new_els
136-
if text:
137-
return [text]
138-
return []
139-
140-
def reset(self):
141-
self.els = []
142-
143-
144-
_group_el = [
145-
_group_bibliography,
146-
_group_table,
147-
]
148-
149-
150-
def group_content(elements):
151-
par_gruop = ParagraphGrouper()
152-
for el in elements:
153-
fragments = [frag for grouper in _group_el for frag in grouper(el)]
154-
if fragments:
155-
fragments = par_gruop.flush() + fragments
156-
else:
157-
fragments = par_gruop.collect(el)
158-
for frag in fragments:
159-
yield frag
160-
161-
for frag in par_gruop.flush():
162-
yield frag
163-
164-
165-
def set_ids_by_labels(soup):
166-
captions = soup.select(".caption")
167-
prefix = "tex4ht:label?:"
168-
for caption in captions:
169-
el = caption.next_sibling
170-
if isinstance(el, Comment) and el.string.startswith(prefix):
171-
label = el.string[len(prefix):].strip()
172-
for table in caption.parent.select("table"):
173-
table["id"] = label
174-
175-
17625
class Fragments(list):
17726

17827
def get_toc(self):
@@ -335,9 +184,7 @@ def print_section(self, name, clean_up=lambda x: x):
335184

336185
    @classmethod
    def read_html(cls, file):
        """Parse *file* as HTML.

        Thin delegate to ``doc_utils.read_html``; kept as a classmethod for
        backward compatibility with existing callers.
        """
        return read_html(file)
341188

342189
@classmethod
343190
def parse_paper(cls, file):

0 commit comments

Comments
 (0)