paperswithcode
diff --git a/‎extract_tables.py
Lines changed: 56 additions & 14 deletions b/‎extract_tables.py
Lines changed: 56 additions & 14 deletions
diff --git a/‎sota_extractor2/data/doc_utils.py
Lines changed: 196 additions & 30 deletions b/‎sota_extractor2/data/doc_utils.py
Lines changed: 196 additions & 30 deletions
@@ -166,19 +166,60 @@ def move_out_references(table):
         wrap_elem_content(anchor, f"<ref id='{fix_id(anchor['href'][1:])}'>", "</ref>")
 
 
-#def move_out_text_styles(table):
-#    ltx_font = 'ltx_font_'
-#    font_selector = f'[class*="{ltx_font}"]'
-#
-#    for elem in table.select(f"span{font_selector}, a{font_selector}, em{font_selector}"):
-#        for c in set(elem.attrs["class"]):
-#            if c == ltx_font + 'bold':
-#                wrap_elem_content(elem, "<b>", "</b>")
-#            elif c == ltx_font + 'italic':
-#                wrap_elem_content(elem, "<i>", "</i>")
-
-
-def move_out_styles(table):
+# Patterns that recognise bold / italic text.
+# The *_font_weight_re / *_font_style_re patterns match one whole inline-CSS
+# declaration, delimited by the start/end of the style string or by `;`.
+# The *_mathjax_font_re patterns match a complete MathJax font class name whose
+# last dash-separated part is B or BI (bold), respectively I or BI (italic).
+bold_font_weight_re = re.compile(r"(^|;)\s*font-weight:\s*(bold|700|800|900)\s*(;|$)")
+bold_mathjax_font_re = re.compile(r"^MJXc-TeX-\w*-BI?$")
+italic_font_style_re = re.compile(r"(^|;)\s*font-style:\s*italic\s*(;|$)")
+italic_mathjax_font_re = re.compile(r"^MJXc-TeX-\w*-B?I$")
+
+def _has_font_class(classes, font_re):
+    """Return True if at least one name in `classes` matches `font_re` from its start."""
+    for class_name in classes:
+        if font_re.match(class_name):
+            return True
+    return False
+
+
+font_color_re = re.compile(r"(^|;)\s*color:\s*(?P<color>#[0-9A-Fa-f]{3,6}|red|green|blue)\s*(;|$)")
+
+
+def _extract_color_from_style(style):
+    """Map the `color` declaration of an inline CSS style to a basic color name.
+
+    Args:
+        style: value of an element's `style` attribute.
+
+    Returns:
+        "red", "green" or "blue" when the declaration names that color, or is a
+        hex color whose corresponding channel is more than twice each of the
+        other two; None when there is no matching declaration, the hex value
+        is malformed, or no channel dominates.
+    """
+    m = font_color_re.search(style)
+    if not m:
+        return None
+    color = m["color"]
+    if color[0] != "#":
+        return color
+
+    digits = color[1:]
+    if len(digits) in (3, 4):
+        # CSS shorthand #RGB / #RGBA: every digit is doubled (#0f0 == #00ff00).
+        # The alpha digit, if any, does not affect the hue and is dropped.
+        digits = "".join(d * 2 for d in digits[:3])
+    elif len(digits) != 6:
+        # the pattern also lets 5 digits through, which is not a valid color
+        return None
+
+    r, g, b = (int(digits[i:i + 2], 16) for i in (0, 2, 4))
+    if r > 2 * g and r > 2 * b:
+        return "red"
+    if g > 2 * r and g > 2 * b:
+        return "green"
+    if b > 2 * r and b > 2 * g:
+        return "blue"
+    return None
+
+
+def move_out_text_styles(table):
+    """Mark bold, italic and colored content of `table` via `wrap_elem_content`.
+
+    Bold and italic elements are recognised by their LaTeXML font class, by an
+    inline CSS declaration, or by a MathJax font class name. Colors come from
+    the inline `color` declaration. All bold elements are processed first,
+    then all italic ones, then the colored ones.
+    """
+    font_styles = (
+        ("ltx_font_bold", "font-weight", bold_font_weight_re, bold_mathjax_font_re, "bold"),
+        ("ltx_font_italic", "font-style", italic_font_style_re, italic_mathjax_font_re, "italic"),
+    )
+    for ltx_class, css_prop, style_re, mathjax_re, tag in font_styles:
+        selector = f'.{ltx_class}, [style*="{css_prop}"], [class*="MJXc-TeX-"]'
+        for elem in table.select(selector):
+            classes = elem.get("class", [])
+            style = elem.get("style", "")
+            # the selector is deliberately broad; confirm the actual style here
+            is_styled = (
+                ltx_class in classes
+                or style_re.search(style)
+                or _has_font_class(classes, mathjax_re)
+            )
+            if is_styled:
+                wrap_elem_content(elem, f"<{tag}>", f"</{tag}>")
+
+    for elem in table.select('[style*="color"]'):
+        color = _extract_color_from_style(elem.get("style"))
+        if color:
+            wrap_elem_content(elem, f"<{color}>", f"</{color}>")
+
+
+def move_out_cell_styles(table):
     ltx_border = 'ltx_border_'
     ltx_align = 'ltx_align_'
     ltx_th = 'ltx_th'
@@ -312,7 +353,8 @@ def extract_tables(filename, outdir):
             continue
         remove_footnotes(table)
         move_out_references(table)
-        move_out_styles(table)
+        move_out_text_styles(table)
+        move_out_cell_styles(table)
         escape_table_content(table)
         tab = html2data(table)
         if tab is None:
 
@@ -1,23 +1,100 @@
 import re
-from bs4 import BeautifulSoup, Comment, Tag
+from bs4 import BeautifulSoup, Comment, Tag, NavigableString
 import codecs
 
 def _handle_reference(el):
+    """Turn an in-document link into an `xxref-<id>` token.
+
+    If the `href` of `el` starts with `#`, the element is emptied and the token
+    built from the simplified target id is returned; otherwise the function
+    falls through and returns None.
+    """
     if el.get('href', "").startswith("#"):
         r = str(el.get('href'))
         el.clear()  # to remove it's content from the descendants iterator
-        return "xxref-" + r[1:]
+        return "xxref-" + _simplify_anchor(r[1:])
+
+
+# LaTeXML class names of elements treated as anchor-like.
+# NOTE(review): in the visible code this set is referenced only from the
+# commented-out part of _handle_anchor -- confirm whether it is still needed.
+_anchor_like_classes = {
+        'ltx_appendix', 'ltx_bibliography', 'ltx_figure', 'ltx_float', 'ltx_graphics', 'ltx_note',
+        'ltx_paragraph', 'ltx_picture', 'ltx_section', 'ltx_subsection', 'ltx_subsubsection', 'ltx_theorem',
+        'ltx_title_section', 'ltx_title_subsection'
+}
+
+def _insert_anchor(el, anchor_id, prefix="xxanchor"):
+    """Insert the space-padded token `<prefix>-<anchor_id>` as the first child of `el`."""
+    marker = NavigableString(f' {prefix}-{anchor_id} ')
+    el.insert(0, marker)
+
+def put_dummy_anchors(soup):
+    """Insert textual anchor / reference tokens into `soup`, modifying it in place.
+
+    Anchors are inserted as text tokens built from simplified element ids;
+    links to `#...` targets get their text replaced by `xxref-<id>`.
+    """
+    # structural elements: anchor only those that carry a non-empty id
+    for elem in soup.select(
+            '.ltx_appendix, .ltx_bibliography, .ltx_bibitem, ' + \
+            '.ltx_figure, .ltx_float, ' + \
+            '.ltx_picture, .ltx_theorem'):
+        id_str = elem.get('id', '')
+        if id_str:
+            _insert_anchor(elem, _simplify_anchor(id_str))
+    # headers take the id of the closest enclosing <section>
+    for elem in soup.select('h2, h3, h4, h5, h6'):
+        sec = elem.find_parent("section")
+        if sec:
+            id_str = sec.get('id')
+            if id_str:
+                _insert_anchor(elem, _simplify_anchor(id_str))
+    # tables are always anchored; a missing id falls back to "xxunk"
+    for elem in soup.select(".ltx_table"):
+        id_str = elem.get('id', "xxunk")
+        _insert_anchor(elem, _simplify_anchor(id_str), "xxtable-xxanchor")
+    # remove the tabular elements themselves from the tree
+    for elem in soup.select(".ltx_tabular"):
+        elem.extract()
+
+    # replace the content of every in-document link with a reference token
+    for elem in soup.select('a[href^="#"]'):
+        r = str(elem.get('href'))
+        elem.string = "xxref-" + _simplify_anchor(r[1:])
+
+    put_footnote_anchors(soup)
+
+def put_footnote_anchors(soup):
+    """Replace footnote marks and tags in `soup` with reference / anchor tokens.
+
+    Marks nested inside the note content are removed. The mark that is a direct
+    child of a `.ltx_role_footnote` element becomes ` xxref-<id> ` and the note
+    tag inside the content becomes ` xxanchor-<id> `, where `<id>` is the
+    simplified id of the footnote element. Footnotes without an id are left
+    unchanged, since there is nothing to build a token from.
+    """
+    for elem in soup.select('.ltx_note_content > .ltx_note_mark'):
+        elem.extract()
+
+    for elem in soup.select('.ltx_role_footnote > .ltx_note_mark'):
+        id_str = elem.parent.get('id')
+        if id_str:  # a missing id would make _simplify_anchor fail on None
+            elem.string = f" xxref-{_simplify_anchor(id_str)} "
+
+    for elem in soup.select('.ltx_note_content > .ltx_tag_note'):
+        ft = elem.find_parent(class_="ltx_role_footnote")
+        id_str = ft.get('id') if ft else None
+        if id_str:
+            elem.string = f" xxanchor-{_simplify_anchor(id_str)} "
+
+# LaTeXML ids contain dots (e.g., S2.SS5); drop them so that the ids can be
+# searched for in elastic without disambiguation.
+def _simplify_anchor(s):
+    return "".join(s.split('.'))
 
 
 def _handle_anchor(el):
+    """Turn an `<a>` element that has an id into an `xxanchor-<id>` token.
+
+    The element is emptied before the token is returned; any other element
+    makes the function fall through and return None.
+
+    NOTE(review): the id is used verbatim here, while _handle_reference passes
+    ids through _simplify_anchor -- confirm the tokens are still meant to match.
+    """
-    if el.get('id', ""):
+    if el.name.lower() == 'a' and el.get('id', ""):
         id_str = el.get('id', "")
         el.clear()  # to remove it's content from the descendants iterator
         return "xxanchor-" + id_str
+#    classes = get_classes(el)
+#    id_str = el.get('id')
+#    if 'ltx_title_section' in classes  or 'ltx_title_subsection' in classes:
+#        print(el.get_text())
+#    print(el.name)
+#    if 'ltx_title_section' in classes or 'ltx_title_subsection' in classes:
+#        print(el.get_text())
+#        # this is workaround to deal with differences between
+#        # htlatex and latexml html structure
+#        # it would be better to make use of latexml structure
+#        sec = el.find_parent("section")
+#        if sec:
+#            id_str = sec.get('id')
+#            print(id_str, el.get_text())
+#
+#    if id_str and classes:
+#        classes = set(classes)
+#        if classes.intersection(_anchor_like_classes):
+#            print('xxanchor-'+id_str)
+#            el.clear()  # to remove it's content from the descendants iterator
+#            return "xxanchor-" + id_str
 
 
 def _handle_table(el):
+    """Turn an element with the `ltx_table` class into an `xxtable-xxanchor-<id>` token.
+
+    The element is emptied before the token is returned; a missing id is
+    replaced by "xxunk". Other elements fall through and return None.
+    """
-    if el.name.lower() == 'table':
+    if 'ltx_table' in get_classes(el):
         id_str = el.get('id', "xxunk")
         el.clear()  # to remove it's content from the descendants iterator
         return f"xxtable-xxanchor-" + id_str
@@ -32,26 +109,31 @@ def _handle_table(el):
 
 def transform(el):
+    """Return the text of `el`.
+
+    A Tag yields its `get_text()`, a Comment yields the empty string, and
+    anything else is converted with `str`. The per-element handlers in
+    `_transforms_el` are no longer applied (that loop is commented out).
+    """
     if isinstance(el, Tag):
-        for f in _transforms_el:
-            r = f(el)
-            if r is not None:
-                return transform(r)
+#        for f in _transforms_el:
+#            r = f(el)
+#            if r is not None:
+#                return transform(r)
+        return el.get_text()
     elif not isinstance(el, Comment):
         return str(el)
     return ''
 
 
+def clean_abstract(t):
+    """Strip a leading "Abstract" / "abstract" label from `t`.
+
+    Leading whitespace before the label and at most one space after it are
+    removed as well; text that does not start with the label is returned
+    unchanged.
+    """
+    # raw string: "\s" in a plain literal is an invalid string escape sequence
+    return re.sub(r"^\s*[aA]bstract ?", "", t)
+
+
 def get_text(*els):
+    """Return the normalized, space-joined text of all `els`.
+
+    Each element is converted with `transform`; the abstract label is no
+    longer stripped here.
+    """
-    t = " ".join([transform(t)
-                  for el in els for t in getattr(el, 'descendants', [el])])
-    t = re.sub("^[aA]bstract ?", "", t)
+#    t = " ".join([transform(t)
+#                  for el in els for t in getattr(el, 'descendants', [el])])
+    t = " ".join([transform(e) for e in els])
+    # collapse runs of spaces, newlines and non-breaking spaces
     t = re.sub("[ \n\xa0]+", " ", t)
+    # drop punctuation around a space-delimited "#token"
     t = re.sub("[;,()]* (#[A-Za-z0-9]+) [;,()]*", r" \1 ", t)
+    # collapse an immediately repeated "#token" into one
     t = re.sub(r" (#[A-Za-z0-9]+) *\1 ", r" \1 ", t)
     return t.strip()
 
 
-def content_in_section(header, names=['h3', 'h4'], skip_comments=True):
+def content_in_section(header, names=['h2', 'h3'], skip_comments=True):
     for el in header.next_siblings:
         if getattr(el, 'name', '') in names:
             break
@@ -60,26 +142,25 @@ def content_in_section(header, names=['h3', 'h4'], skip_comments=True):
         yield el
 
 
-def get_class(el):
+def get_classes(el):
+    """Return the `class` attribute of `el`, or [] when it is absent.
+
+    Objects without a `get` method also give [].
+    """
     if hasattr(el, 'get'):
-        # fixme: less convoluted way to return '' if calss is not found
-        return (el.get('class', [''])+[''])[0]
+        return el.get('class', [])
     else:
-        return ''
+        return []
 
 
 def get_name(el):
+    """Return `el.name` when that attribute exists and is truthy, otherwise ''."""
     return hasattr(el, 'name') and el.name or ''
 
 
 def _group_bibliography(el):
+    """Return one text per `li.ltx_bibitem` inside an `ltx_bibliography` element, else []."""
-    if get_class(el) == 'thebibliography':
-        return [get_text(i) for i in el.select('p.bibitem')]
+    if 'ltx_bibliography' in get_classes(el):
+        return [get_text(i) for i in el.select('li.ltx_bibitem')]
     return []
 
 
 def _group_table(el):
+    """Return the text of an `ltx_table` element as a one-item list, else []."""
-    if get_class(el) == 'table':
+    if 'ltx_table' in get_classes(el):
         return [get_text(el)]
     return []
 
@@ -92,7 +173,7 @@ def __init__(self):
     def collect(self, el):
         if get_name(el) == 'table':
             self.join_next_p = True
-        elif get_name(el) == "p":
+        elif 'ltx_para' in get_classes(el):
             if self.join_next_p:
                 self.join_next_p = False
                 self.els.append(el)
@@ -123,7 +204,7 @@ def reset(self):
 ]
 
 
-def group_content(elements):
+def group_content2(elements):
     par_gruop = ParagraphGrouper()
     for el in elements:
         fragments = [frag for grouper in _group_el for frag in grouper(el)]
@@ -138,15 +219,100 @@ def group_content(elements):
         yield frag
 
 
-def set_ids_by_labels(soup):
-    captions = soup.select(".caption")
-    prefix = "tex4ht:label?:"
-    for caption in captions:
-        el = caption.next_sibling
-        if isinstance(el, Comment) and el.string.startswith(prefix):
-            label = el.string[len(prefix):].strip()
-            for table in caption.parent.select("table"):
-                table["id"] = label
+def walk(elem):
+    """Yield the children of `elem` in document order, flattening containers.
+
+    A child that is a `section` element or carries the `ltx_biblist` class is
+    not yielded itself; its own children are walked recursively instead.
+    """
+    for child in elem.children:
+        is_container = child.name == 'section' or 'ltx_biblist' in get_classes(child)
+        if is_container:
+            yield from walk(child)
+            continue
+        yield child
+
+class Grouper:
+    """Stateful splitter that groups a document's elements into text fragments.
+
+    `group_content` yields tuples `(section_idx, subsection_idx, header, text)`:
+    the index of the current h2/h3 section, the running index of the fragment
+    inside that section, the header text, and the collected text.
+    """
+
+    def __init__(self):
+        self.out = []  # text pieces collected since the last flush
+        self.section_idx = -1  # becomes 0 at the first section header
+        self.subsection_idx = 0  # index of the next fragment in the current section
+        self.header = ""  # text of the current section header
+        self.in_section = False # move elements before first section into that section
+        self.section_output = False # if a section is empty and new section begins, output it for keep header
+
+    def get_output_text(self):
+        """Return the collected text pieces joined by single spaces."""
+        return " ".join(self.out)
+
+    def flush(self):
+        """Yield the pending fragment and reset the buffer.
+
+        Nothing is yielded (and the buffer is kept) before the first header.
+        """
+        if self.in_section:
+            r = max(self.section_idx, 0), self.subsection_idx, self.header, self.get_output_text()
+            self.out = []
+            self.section_output = True
+            self.subsection_idx += 1
+            yield r
+
+    def new_section(self, header_el):
+        """Start a new section whose header text is taken from `header_el`."""
+        if not self.section_output: # output (possibly) empty section so header won't be lost
+            yield from self.flush()
+        self.section_output = False
+        self.in_section = True
+        self.section_idx += 1
+        self.subsection_idx = 0
+        self.header = get_text(header_el)
+
+    def append(self, el):
+        """Add the text of `el` to the buffer unless it is empty."""
+        t = get_text(el).strip()
+        if t != "":
+            self.out.append(t)
+
+    def group_content(self, doc):
+        """Yield the fragments of `doc`, one per paragraph, figure or bibitem."""
+        for el in walk(doc):
+            classes = get_classes(el)
+            if el.name in ["h2", "h3"]:
+                yield from self.new_section(el)
+            elif el.name == "h1":
+                continue
+            elif 'ltx_para' in classes or el.name == "figure" or 'ltx_bibitem' in classes:
+                self.append(el)
+                yield from self.flush()
+            else:
+                self.append(el)
+        # End of document: emit text buffered after the last paragraph, and a
+        # header-only last section, instead of silently dropping them.
+        if self.out or not self.section_output:
+            yield from self.flush()
+
+def group_content(doc):
+    """Yield the fragments produced by a fresh `Grouper` for `doc`."""
+    grouper = Grouper()
+    for fragment in grouper.group_content(doc):
+        yield fragment
+
+def group_content3(doc):
+    """Yield `(section_idx, subsection_idx, header, text)` tuples for `doc`.
+
+    Elements are buffered in `out`; the buffer is emitted when a new h2/h3
+    header, another title, or a new paragraph-like element arrives after a
+    paragraph-like element has been seen, and once more at the end.
+    """
+    out = []
+    section_idx = -1
+    subsection_idx = 0
+    header = ""
+    has_paragraph = False  # set at the first paragraph-like element, never reset
+    for el in walk(doc):
+        classes = get_classes(el)
+        if el.name in ["h2", "h3"]:
+            # new section: emit what was collected, then restart the numbering
+            if len(out) and has_paragraph:
+                yield (max(section_idx, 0), subsection_idx, header, " ".join([get_text(o) for o in out]))
+                out = []
+            section_idx += 1
+            subsection_idx = 0
+            header = get_text(el)
+            continue
+        elif 'ltx_title' in classes and el.name != "h1":
+            # other titles start a new buffer but stay part of the text
+            if len(out) and has_paragraph:
+                yield (max(section_idx, 0), subsection_idx, header, " ".join([get_text(o) for o in out]))
+                out = []
+            out += [el]
+
+        elif 'ltx_title_document' in classes:
+            continue
+        elif 'ltx_para' in classes or el.name == "figure" or 'ltx_bibitem' in classes:
+            # paragraph-like element: emit the previous buffer, if any, first
+            if len(out) and has_paragraph:
+                yield (max(section_idx, 0), subsection_idx, header, " ".join([get_text(o) for o in out]))
+                subsection_idx += 1
+                out = []
+            has_paragraph = True
+            out += [el]
+        else:
+            out.append(el)
+    # emit whatever is left at the end of the document
+    if len(out):
+        yield (max(section_idx, 0), subsection_idx, header, " ".join([get_text(o) for o in out]))
 
 def read_html(file):
     with codecs.open(file, 'r', encoding='UTF-8') as f: