Massive changes to table handling to allow for targets other than HTML.

intelkevinputnam · intelkevinputnam · commit a03fa73d2732 · 2021-05-07T16:44:08.000-07:00
Signed-off-by: Kevin Putnam <kevin.putnam@intel.com>
diff --git a/sphinx_md/__init__.py b/sphinx_md/__init__.py
@@ -3,6 +3,7 @@
 """
 
 import sphinx
+from .convert_html import html2Docutils
 
 from docutils import nodes
 from os.path import isdir, isfile, join, basename, dirname
@@ -153,6 +154,8 @@ def setup(app):
     app.add_config_value('sphinx_md_useGitHubURL',False,'')
     app.add_config_value('sphinx_md_githubFileURL','','')
     app.add_config_value('sphinx_md_githubDirURL','','')
+    app.add_config_value('sphinx_md_tableIDs',{},'')
+    app.connect('doctree-read',html2Docutils)
     app.connect('doctree-resolved',fixLocalMDAnchors)
     app.connect('missing-reference',fixRSTLinkInMD)
 
diff --git a/sphinx_md/convert_html.py b/sphinx_md/convert_html.py
@@ -0,0 +1,149 @@
+from bs4 import BeautifulSoup, Tag
+from docutils import nodes
+import os.path
+
+# Tag names that html2Docutils converts: a raw node is only rewritten when
+# the first element parsed from its text has one of these names.
+known_start_tags = ['p','img','table']
+
+def html2Docutils(app, doctree):
+    """Replace raw nodes holding known HTML with equivalent docutils nodes.
+
+    Every ``raw`` node in ``doctree`` is parsed with BeautifulSoup. When the
+    first element found has a name listed in ``known_start_tags``, the raw
+    node is replaced by a ``container`` node filled in by ``convertHTML``.
+    Raw nodes that contain no element, or start with another tag, are left
+    untouched.
+
+    :param app: Sphinx application object, passed through to ``convertHTML``.
+    :param doctree: document tree to rewrite in place; its ``source``
+        attribute is used as the path of the document being processed.
+    """
+    filepath = doctree['source']
+    # Snapshot the raw nodes first: the tree is modified inside the loop.
+    for node in list(doctree.traverse(nodes.raw)):
+        soup = BeautifulSoup(node.astext(), features="html.parser")
+        first_tag = soup.find()
+        # find() yields None when the raw text contains no element at all
+        # (plain text, a lone comment, an empty string).
+        if first_tag is None or first_tag.name not in known_start_tags:
+            continue
+        div = nodes.container()
+        convertHTML(soup, div, app, filepath)
+        node.parent.replace(node, div)
+
+def convertHTML(soup, parent, app, filepath):
+    """Recursively convert the children of ``soup`` into docutils nodes.
+
+    Each recognised child tag (``table``, ``p``, ``img``, ``thead``,
+    ``tbody``, ``tr``, ``th``/``td``) gets a matching docutils node appended
+    under ``parent``, and its own children are then converted beneath that
+    node. Text is kept only when ``parent`` is an entry, paragraph or image
+    node. Unrecognised tags are dropped together with everything inside them.
+
+    :param soup: BeautifulSoup object or tag whose children are converted.
+    :param parent: docutils node that receives the converted children.
+    :param app: Sphinx application; ``app.env`` supplies the source
+        directory, the image registry and the per-document table counters.
+    :param filepath: path of the source document being processed.
+    """
+    if not hasattr(soup, "children"):
+        return
+    # Tags that translate one-to-one into an empty docutils node.
+    simple_nodes = {
+        "p": nodes.paragraph,
+        "thead": nodes.thead,
+        "tbody": nodes.tbody,
+        "tr": nodes.row,
+        "th": nodes.entry,
+        "td": nodes.entry,
+    }
+    for child in soup.children:
+        if child.name is None:
+            # No tag name: this child is text. Text nodes are leaves, so
+            # there is nothing to recurse into.
+            if isinstance(parent, (nodes.entry, nodes.paragraph, nodes.image)):
+                parent += nodes.Text(child)
+            continue
+        if child.name == "table":
+            # Children of the <table> tag hang off the tgroup, not the table.
+            node = _buildTable(child, parent, app, filepath)
+        elif child.name == "img":
+            node = _buildImage(child, app, filepath)
+            parent += node
+        elif child.name in simple_nodes:
+            node = simple_nodes[child.name]()
+            parent += node
+        else:
+            continue
+        convertHTML(child, node, app, filepath)
+
+
+def _buildTable(tableTag, parent, app, filepath):
+    """Append a docutils table for ``tableTag`` to ``parent``; return its tgroup."""
+    # One counter per document, keyed by the document's own path so that
+    # successive tables in the same file receive distinct ids.
+    tableIDs = app.env.config.sphinx_md_tableIDs
+    tableIDs[filepath] = tableIDs.get(filepath, -1) + 1
+    tNode = nodes.table()
+    tNode['ids'].append("id" + str(tableIDs[filepath]))
+    tgroup = nodes.tgroup()
+    ncols = getNumCols(tableTag)
+    tgroup['cols'] = ncols
+    for _ in range(ncols):
+        colspecNode = nodes.colspec()
+        colspecNode["colwidth"] = 1
+        tgroup += colspecNode
+    tNode += nodes.title()
+    tNode += tgroup
+    parent += tNode
+    return tgroup
+
+
+def _buildImage(imgTag, app, filepath):
+    """Return a docutils image node carrying the attributes of ``imgTag``."""
+    node = nodes.image()
+    attrs = imgTag.attrs
+    imgPath = ""
+    if "alt" in attrs:
+        node["alt"] = attrs['alt']
+    if "src" in attrs:
+        srcdir = app.env.srcdir
+        docRelPath = os.path.relpath(filepath, srcdir)
+        docfilename = os.path.splitext(docRelPath)[0]
+        # Image URI expressed relative to the source directory.
+        imgPath = os.path.join(os.path.dirname(docRelPath), attrs['src'])
+        node["uri"] = imgPath
+        # imgPath is relative to the source directory, so resolve it there
+        # rather than against whatever the current working directory is.
+        if os.path.isfile(os.path.join(srcdir, imgPath)):
+            if imgPath not in app.env.images:
+                imageFileName = os.path.basename(imgPath)
+                app.env.images[imgPath] = ({docfilename}, imageFileName)
+    for dimension in ("width", "height"):
+        if dimension in attrs:
+            node[dimension] = attrs[dimension]
+    # A real mapping, not its string form, so consumers can look keys up.
+    node["candidates"] = {'*': imgPath}
+    return node
+
+def removeHTMLAttributes(soup,tagName):
+    """Strip every attribute from each ``tagName`` element in ``soup``.
+
+    :param soup: parsed document, modified in place.
+    :param tagName: name of the tags whose attributes are removed.
+    :returns: the same ``soup`` object.
+    """
+    for element in soup.find_all(tagName):
+        # Copy the attribute names before deleting: the attribute mapping
+        # cannot be changed while it is being iterated.
+        for attrName in list(element.attrs):
+            del element[attrName]
+    return soup
+
+def replaceTag(soup,oldTag,newTag,delAttrs=True):
+    """Rename every ``oldTag`` element in ``soup`` to ``newTag``.
+
+    :param soup: parsed document, modified in place.
+    :param oldTag: tag name to look for.
+    :param newTag: tag name given to each match.
+    :param delAttrs: when true, also remove all attributes of each match.
+    :returns: the same ``soup`` object.
+    """
+    for element in soup.find_all(oldTag):
+        element.name = newTag
+        if not delAttrs:
+            continue
+        # Copy the attribute names before deleting: the attribute mapping
+        # cannot be changed while it is being iterated.
+        for attrName in list(element.attrs):
+            del element[attrName]
+    return soup
+
+def fixImages(soup):
+    """Rewrite each ``img`` element in ``soup`` as an ``image`` element.
+
+    The ``src`` attribute is renamed to ``href`` and the ``alt`` attribute
+    is moved into a child ``alt`` element. Either attribute may be absent,
+    in which case that step is skipped.
+
+    :param soup: parsed document, modified in place.
+    :returns: the same ``soup`` object.
+    """
+    for imgTag in soup.find_all('img'):
+        imgTag.name = "image"
+        if 'src' in imgTag.attrs:
+            imgTag['href'] = imgTag['src']
+            del imgTag['src']
+        # Images without alternative text are valid HTML; only build the
+        # child element when there is text to put in it.
+        if 'alt' in imgTag.attrs:
+            altTag = soup.new_tag("alt")
+            altTag.string = imgTag['alt']
+            del imgTag['alt']
+            imgTag.append(altTag)
+    return soup
+
+
+def addTGroup(soup):
+    """Turn each ``table`` element into a ``tgroup`` wrapped in a new ``table``.
+
+    The original element is renamed to ``tgroup``, given a ``cols``
+    attribute holding its own column count, stripped of its ``class``
+    attribute, and then wrapped in a freshly created ``table`` element.
+
+    :param soup: parsed document, modified in place.
+    :returns: the same ``soup`` object.
+    """
+    for tableTag in soup.find_all('table'):
+        # Count columns per table (before renaming it) so that documents
+        # with several differently sized tables get the right value on each.
+        numCols = getNumCols(tableTag)
+        tableTag.name = 'tgroup'
+        tableTag['cols'] = numCols
+        del tableTag['class']
+        wrap(tableTag, soup.new_tag("table"))
+    return soup
+
+def wrap(to_wrap, wrap_in):
+    """Put ``wrap_in`` where ``to_wrap`` was and make ``to_wrap`` its child.
+
+    :param to_wrap: element currently in the tree.
+    :param wrap_in: new element that takes its place and receives it.
+    """
+    # replace_with() swaps the element out of the tree and hands it back.
+    detached = to_wrap.replace_with(wrap_in)
+    wrap_in.append(detached)
+
+def getNumCols(soup):
+    """Return the number of columns of the first table row under ``soup``.
+
+    The count is the number of ``th``/``td`` cells that are direct children
+    of the first ``tr`` found, so tables without header cells are handled
+    and header cells repeated on later rows are not counted twice. Cell
+    spans are not taken into account. When no ``tr`` exists at all, the
+    total number of ``th`` elements is returned instead.
+
+    :param soup: BeautifulSoup object or tag to inspect.
+    :returns: column count as an ``int``.
+    """
+    firstRow = soup.find('tr')
+    if firstRow is None:
+        return len(soup.find_all('th'))
+    # recursive=False keeps cells of any nested table out of the count.
+    return len(firstRow.find_all(['th', 'td'], recursive=False))