Skip to content

Commit 711dd38

Browse files
Merge pull request #1 from intelkevinputnam/convert-html-tables-to-docutils
Convert some raw HTML objects to Docutils objects for publication to targets other than HTML
2 parents 20932ff + e702d8d commit 711dd38

File tree

3 files changed

+179
-2
lines changed

3 files changed

+179
-2
lines changed

setup.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55

66
setuptools.setup(
77
name="sphinx-md",
8-
version="0.0.2",
8+
version="0.0.3",
99
author="Kevin Putnam",
1010
author_email="[email protected]",
1111
description="Sphinx extension to use with Recommonmark to fix links to rst from md, links to md from rst, and links to embedded files and dirs.",
@@ -19,4 +19,5 @@
1919
"Operating System :: OS Independent",
2020
],
2121
python_requires='>=3.6',
22-
)
22+
install_requires=['recommonmark','sphinx-markdown-tables','bs4'],
23+
)

sphinx_md/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33
"""
44

55
import sphinx
6+
from .convert_html import html2Docutils
67

78
from docutils import nodes
89
from os.path import isdir, isfile, join, basename, dirname
@@ -153,6 +154,9 @@ def setup(app):
153154
app.add_config_value('sphinx_md_useGitHubURL',False,'')
154155
app.add_config_value('sphinx_md_githubFileURL','','')
155156
app.add_config_value('sphinx_md_githubDirURL','','')
157+
app.add_config_value('sphinx_md_processRaw',False,'')
158+
app.add_config_value('sphinx_md_tableIDs',{},'')
159+
app.connect('doctree-read',html2Docutils)
156160
app.connect('doctree-resolved',fixLocalMDAnchors)
157161
app.connect('missing-reference',fixRSTLinkInMD)
158162

sphinx_md/convert_html.py

Lines changed: 172 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,172 @@
1+
from bs4 import BeautifulSoup, Tag
2+
from docutils import nodes
3+
import os.path
4+
5+
# Tag names that qualify a raw node for conversion: html2Docutils only converts
# a raw node when the first tag parsed from its text is one of these.
known_start_tags = ['p','img','table']
6+
# Tag name that marks a raw node as hidden: html2Docutils replaces such a node
# with a docutils comment node instead of converting its content.
hidden_tag = 'hidden'
7+
8+
def html2Docutils(app, doctree):
    """Replace raw HTML nodes in *doctree* with equivalent docutils nodes.

    Does nothing unless the ``sphinx_md_processRaw`` config value is truthy.
    Each raw node's text is parsed with BeautifulSoup and the first tag found
    decides what happens:

    * a tag listed in ``known_start_tags``: the snippet is converted with
      ``convertHTML`` into a temporary container, and the raw node is
      replaced by that container's children;
    * the ``hidden_tag`` tag: the raw node is replaced by a comment node
      holding the text ``hidden``;
    * anything else (or no tag at all): the raw node is left untouched.

    Args:
        app: The Sphinx application; ``app.env.config`` is read here and
            ``app`` is passed on to ``convertHTML``.
        doctree: The document tree to rewrite in place; ``doctree['source']``
            is used as the path of the document being processed.
    """
    if not app.env.config.sphinx_md_processRaw:
        return

    source_path = doctree['source']
    converted = 0
    for raw_node in doctree.traverse(nodes.raw):
        soup = BeautifulSoup(raw_node.astext(), features="html.parser")
        first_tag = soup.find()
        if not first_tag:
            # No tag could be parsed from this raw node; leave it alone.
            continue

        if first_tag.name in known_start_tags:
            # The container only collects the converted nodes; its children,
            # not the container itself, take the raw node's place.
            wrapper = nodes.container()
            wrapper['id'] = 'html-content-' + str(converted)
            converted += 1
            convertHTML(soup, wrapper, app, source_path)
            raw_node.parent.replace(raw_node, wrapper.children)
        elif first_tag.name == hidden_tag:
            placeholder = nodes.comment()
            placeholder.append(nodes.Text("hidden"))
            raw_node.parent.replace(raw_node, placeholder)
34+
35+
def convertHTML(soup, parent, app, filepath):
    """Recursively convert the children of *soup* into docutils nodes.

    Every child of *soup* is mapped to a docutils node that is appended to
    *parent*; the function then recurses into the child with the new node as
    the parent. Handled tags are ``table``, ``p``, ``img``, ``thead``,
    ``tbody``, ``tr``, ``th``/``td``, ``sup``, ``a`` and ``code``. Children
    with any other tag name are skipped together with their content. Text
    children become ``nodes.Text`` when *parent* is a docutils node.

    Args:
        soup: A BeautifulSoup object or tag. Objects without a ``children``
            attribute (such as text strings) are ignored.
        parent: The docutils node that receives the converted nodes.
        app: The Sphinx application. ``app.env.config.sphinx_md_tableIDs``,
            ``app.env.srcdir`` and ``app.env.images`` are used.
        filepath: Path of the source document being converted. It keys the
            per-document table counter and anchors relative image paths.

    Raises:
        KeyError: If an ``<a>`` tag has no ``href`` attribute.
    """
    if not hasattr(soup, "children"):
        return

    for child in soup.children:
        node = None
        tag_name = getattr(child, "name", None)
        if tag_name is None:
            # Not a tag (plain text): only attach it to real docutils nodes.
            if isinstance(parent, nodes.Node):
                node = nodes.Text(child)
                parent += node
        elif tag_name == "table":
            # Rows are children of the tgroup, so recurse into that node.
            node = _convert_table(child, parent, app, filepath)
        elif tag_name == "img":
            node = _convert_image(child, app, filepath)
            parent += node
        elif tag_name in ("th", "td"):
            # Cell content lives in a paragraph inside the entry node.
            entry = nodes.entry()
            node = nodes.paragraph()
            entry += node
            parent += entry
        elif tag_name == "a":
            node = nodes.reference()
            node["refuri"] = child.attrs['href']
            parent += node
        else:
            node = _simple_node(tag_name)
            if node is not None:
                parent += node

        if node:
            convertHTML(child, node, app, filepath)


def _simple_node(tag_name):
    """Return a new docutils node for a tag that maps one-to-one, else None."""
    factories = {
        "p": nodes.paragraph,
        "thead": nodes.thead,
        "tbody": nodes.tbody,
        "tr": nodes.row,
        "sup": nodes.superscript,
        "code": nodes.literal,
    }
    factory = factories.get(tag_name)
    return factory() if factory is not None else None


def _convert_table(table_tag, parent, app, filepath):
    """Append a table node for *table_tag* to *parent*; return its tgroup."""
    # Number the tables per source document. The counter is keyed by the
    # document path itself so that each document counts its own tables.
    table_ids = app.env.config.sphinx_md_tableIDs
    table_ids[filepath] = table_ids.get(filepath, -1) + 1

    table_node = nodes.table()
    table_node['ids'].append("id" + str(table_ids[filepath]))

    tgroup = nodes.tgroup()
    ncols = getNumCols(table_tag)
    tgroup['cols'] = ncols
    for _ in range(ncols):
        colspec = nodes.colspec()
        colspec["colwidth"] = 1
        tgroup += colspec

    table_node += nodes.title()
    table_node += tgroup
    parent += table_node
    return tgroup


def _convert_image(img_tag, app, filepath):
    """Build a docutils image node from the attributes of *img_tag*."""
    node = nodes.image()
    attrs = img_tag.attrs
    imgPath = ""

    if "alt" in attrs:
        node["alt"] = attrs['alt']

    if "src" in attrs:
        src = attrs['src']
        if src.startswith(("http://", "https://")):
            # Remote image: keep the URL untouched.
            node["uri"] = src
        else:
            # Local image: build its path from the document's directory
            # relative to the Sphinx source directory.
            basepath = app.env.srcdir + "/"
            doc_relpath = os.path.relpath(filepath, basepath)
            docfilename = os.path.splitext(doc_relpath)[0]
            imgPath = os.path.join(os.path.dirname(doc_relpath), src)
            node["uri"] = imgPath
            if os.path.isfile(imgPath) and imgPath not in app.env.images:
                # Record the image in the environment for this document.
                app.env.images[imgPath] = ({docfilename}, os.path.basename(imgPath))

    if "width" in attrs:
        # A bare number is given a pixel unit.
        suffix = 'px' if attrs['width'].isnumeric() else ''
        node["width"] = attrs['width'] + suffix

    if "height" in attrs:
        node["height"] = attrs['height']

    # NOTE(review): candidates is stored as a string, as in the original code;
    # confirm whether consumers expect a dict here.
    node["candidates"] = "{'*': '" + imgPath + "'}"
    return node
120+
121+
def removeHTMLAttributes(soup,tagName):
    """Strip every attribute from all *tagName* tags found in *soup*.

    Args:
        soup: Parsed document (or tag) providing ``find_all``.
        tagName: Name of the tags whose attributes are removed.

    Returns:
        The same *soup* object, modified in place.
    """
    for element in soup.find_all(tagName):
        # Snapshot the attribute names first: deleting while iterating over
        # the live attribute mapping is not safe.
        for attr_name in list(element.attrs):
            del element[attr_name]
    return soup
130+
131+
def replaceTag(soup,oldTag,newTag,delAttrs=True):
    """Rename every *oldTag* tag in *soup* to *newTag*.

    Args:
        soup: Parsed document (or tag) providing ``find_all``.
        oldTag: Name of the tags to rename.
        newTag: Name given to the matching tags.
        delAttrs: When true (the default), all attributes of the renamed
            tags are removed as well.

    Returns:
        The same *soup* object, modified in place.
    """
    for element in soup.find_all(oldTag):
        element.name = newTag
        if not delAttrs:
            continue
        # Copy the attribute names before deleting from the live mapping.
        for attr_name in list(element.attrs):
            del element[attr_name]
    return soup
142+
143+
def fixImages(soup):
    """Rewrite every ``img`` tag in *soup* as an ``image`` tag.

    For each image the ``src`` attribute is moved to ``href`` and the ``alt``
    attribute is turned into a child ``alt`` tag holding the alternative
    text. Images that lack ``src`` or ``alt`` are still renamed; the missing
    attribute is skipped instead of raising ``KeyError``.

    Args:
        soup: Parsed document providing ``find_all`` and ``new_tag``.

    Returns:
        The same *soup* object, modified in place.
    """
    for imgTag in soup.find_all('img'):
        imgTag.name = "image"

        src = imgTag.get('src')
        if src is not None:
            imgTag['href'] = src
            del imgTag['src']

        alt = imgTag.get('alt')
        if alt is not None:
            # The alternative text moves from an attribute to a child tag.
            altTag = soup.new_tag("alt")
            altTag.string = alt
            del imgTag['alt']
            imgTag.append(altTag)
    return soup
154+
155+
156+
def addTGroup(soup):
    """Turn every ``table`` tag in *soup* into a ``tgroup`` inside a new table.

    Each table tag is renamed to ``tgroup``, given a ``cols`` attribute, has
    its ``class`` attribute deleted, and is wrapped in a newly created
    ``table`` tag. The column count is taken from each table individually,
    so several tables in one soup do not inflate each other's ``cols``.

    Args:
        soup: Parsed document providing ``find_all`` and ``new_tag``.

    Returns:
        The same *soup* object, modified in place.
    """
    # find_all returns its matches up front, so the wrapper tables created
    # inside the loop are not visited again.
    for tableTag in soup.find_all('table'):
        numCols = getNumCols(tableTag)
        tableTag.name = 'tgroup'
        tableTag['cols'] = numCols
        del tableTag['class']
        wrap(tableTag, soup.new_tag("table"))
    return soup
165+
166+
def wrap(to_wrap, wrap_in):
    """Wrap *to_wrap* inside *wrap_in*.

    *wrap_in* first takes the place of *to_wrap* via ``replace_with``; the
    value returned by that call is then appended to *wrap_in*.

    Args:
        to_wrap: The element to be wrapped.
        wrap_in: The element that becomes the wrapper.
    """
    wrap_in.append(to_wrap.replace_with(wrap_in))
169+
170+
def getNumCols(soup):
    """Return the number of ``th`` tags found in *soup*.

    The count of header cells is what the callers use as the column count.

    Args:
        soup: Parsed document (or tag) providing ``find_all``.
    """
    return len(soup.find_all('th'))

0 commit comments

Comments
 (0)