3
3
import os .path
4
4
5
5
# A raw HTML fragment is converted to docutils nodes only when its first tag
# is one of these.
known_start_tags = ['p', 'img', 'table']
# A raw fragment whose first tag has this name is replaced by a comment node.
hidden_tag = 'hidden'
6
7
7
8
def html2Docutils(app, doctree):
    """Replace raw HTML nodes in *doctree* with docutils nodes.

    Does nothing unless ``app.env.config.sphinx_md_processRaw`` is truthy.
    Every ``nodes.raw`` node is parsed with BeautifulSoup and dispatched on
    the name of the first tag found in it:

    * a name listed in ``known_start_tags``: the fragment is handed to
      ``convertHTML`` and the raw node is replaced by the children that
      call attached to a temporary container;
    * ``hidden_tag``: the raw node is replaced by a comment node containing
      the text ``"hidden"``;
    * anything else, or a fragment without any tag: left untouched.

    Args:
        app: Application object; only ``app.env.config`` is read here, the
            object itself is forwarded to ``convertHTML``.
        doctree: Document tree to rewrite. ``doctree['source']`` is
            forwarded to ``convertHTML`` as the file path.

    Returns:
        None. The doctree is modified in place.
    """
    if not app.env.config.sphinx_md_processRaw:
        return

    filepath = doctree['source']
    htmlCounter = 0

    # Iterate over a snapshot: nodes are replaced inside the loop, and
    # mutating the tree while a lazy traversal is in flight is unsafe.
    for node in list(doctree.traverse(nodes.raw)):
        soup = BeautifulSoup(node.astext(), features="html.parser")
        first_tag = soup.find()
        if first_tag is None:
            # Plain text or empty fragment: nothing to convert.
            continue

        if first_tag.name in known_start_tags:
            # Temporary container that collects the converter's output.
            div = nodes.container()
            div['id'] = 'html-content-' + str(htmlCounter)
            htmlCounter += 1
            convertHTML(soup, div, app, filepath)
            # Only the container's children are spliced in; the container
            # itself (and the id set on it) does not end up in the tree.
            node.parent.replace(node, div.children)
        elif first_tag.name == hidden_tag:
            hidden_comment = nodes.comment()
            hidden_comment.append(nodes.Text("hidden"))
            node.parent.replace(node, hidden_comment)
21
34
22
35
def convertHTML (soup , parent , app , filepath ):
23
36
if hasattr (soup ,"children" ):
24
37
for child in soup .children :
25
38
node = None
26
- if child .name is not None :
39
+ if hasattr ( child , "name" ) and child .name is not None :
27
40
if child .name == "table" :
28
41
if filepath not in app .env .config .sphinx_md_tableIDs :
29
42
app .env .config .sphinx_md_tableIDs ['filepath' ]= 0
@@ -51,19 +64,24 @@ def convertHTML(soup, parent, app, filepath):
51
64
if "alt" in child .attrs :
52
65
node ["alt" ]= child .attrs ['alt' ]
53
66
if "src" in child .attrs :
54
- basepath = app .env .srcdir + "/"
55
- docfilename = os .path .splitext (os .path .relpath (filepath ,basepath ))[0 ]
56
- relpath = os .path .dirname (os .path .relpath (filepath ,basepath ))
57
- imgPath = os .path .join (relpath ,child .attrs ['src' ])
58
- node ["uri" ]= imgPath
59
- print ("Checking for file." )
60
- if os .path .isfile (imgPath ):
61
- if imgPath not in app .env .images :
62
- imageFileName = os .path .basename (imgPath )
63
- imageTuple = ({docfilename },imageFileName )
64
- app .env .images [imgPath ]= imageTuple
67
+ if "https" in child .attrs ['src' ]:
68
+ node ["uri" ]= child .attrs ['src' ]
69
+ else :
70
+ basepath = app .env .srcdir + "/"
71
+ docfilename = os .path .splitext (os .path .relpath (filepath ,basepath ))[0 ]
72
+ relpath = os .path .dirname (os .path .relpath (filepath ,basepath ))
73
+ imgPath = os .path .join (relpath ,child .attrs ['src' ])
74
+ node ["uri" ]= imgPath
75
+ if os .path .isfile (imgPath ):
76
+ if imgPath not in app .env .images :
77
+ imageFileName = os .path .basename (imgPath )
78
+ imageTuple = ({docfilename },imageFileName )
79
+ app .env .images [imgPath ]= imageTuple
65
80
if "width" in child .attrs :
66
- node ["width" ]= child .attrs ['width' ]
81
+ suffix = ''
82
+ if child .attrs ['width' ].isnumeric ():
83
+ suffix = 'px'
84
+ node ["width" ]= child .attrs ['width' ] + suffix
67
85
if "height" in child .attrs :
68
86
node ["height" ]= child .attrs ['height' ]
69
87
node ["candidates" ]= "{'*': '" + imgPath + "'}"
@@ -78,22 +96,27 @@ def convertHTML(soup, parent, app, filepath):
78
96
node = nodes .row ()
79
97
parent += node
80
98
elif child .name == "th" or child .name == "td" :
81
- node = nodes .entry ()
99
+ eNode = nodes .entry ()
100
+ node = nodes .paragraph ()
101
+ eNode += node
102
+ parent += eNode
103
+ elif child .name == "sup" :
104
+ node = nodes .superscript ()
105
+ parent += node
106
+ elif child .name == "a" :
107
+ node = nodes .reference ()
108
+ node ["refuri" ] = child .attrs ['href' ]
109
+ parent += node
110
+ elif child .name == "code" :
111
+ node = nodes .literal ()
82
112
parent += node
83
113
else :
84
- if isinstance (parent , nodes .entry ) or isinstance (parent , nodes .paragraph ) or isinstance (parent , nodes .image ):
114
+ if isinstance (parent ,nodes .Node ):
115
+ #if isinstance(parent, nodes.entry) or isinstance(parent, nodes.paragraph) or isinstance(parent, nodes.image) or isinstance(parent, nodes.superscript) or isinstance(parent, nodes.reference) or isinstance(parent, nodes.literal):
85
116
node = nodes .Text (child )
86
117
parent += node
87
118
if node :
88
119
convertHTML (child ,node ,app ,filepath )
89
- # soup = removeHTMLAttributes(soup,"table")
90
- # soup = removeHTMLAttributes(soup,"p")
91
- # soup = replaceTag(soup,"tr","row")
92
- # soup = replaceTag(soup,"th","entry")
93
- # soup = replaceTag(soup,"td","entry")
94
- # soup = addTGroup(soup)
95
- # soup = fixImages(soup)
96
- # ditaXML = str(soup)
97
120
98
121
def removeHTMLAttributes (soup ,tagName ):
99
122
tags = soup .find_all (tagName )
0 commit comments