1
+ from bs4 import BeautifulSoup , Tag
2
+ from docutils import nodes
3
+ import os .path
4
+
5
# Names of the tags that may open a raw HTML fragment eligible for conversion
# into docutils nodes (compared against the first tag of each raw node).
known_start_tags = ['p', 'img', 'table']

# Name of the first tag that marks a raw fragment to be swapped for a comment.
hidden_tag = 'hidden'
7
+
8
def html2Docutils(app, doctree):
    """Replace raw HTML nodes in ``doctree`` with docutils nodes.

    Nothing happens unless the ``sphinx_md_processRaw`` config value is
    truthy. Every ``raw`` node is parsed with BeautifulSoup and handled
    according to the name of the first tag found in it:

    * a name listed in ``known_start_tags``: the fragment is converted by
      ``convertHTML`` into a temporary container, and the raw node is
      replaced by that container's children;
    * ``hidden_tag``: the raw node is replaced by a comment node whose text
      is ``"hidden"``;
    * anything else, or no tag at all: the raw node is left as it is.

    :param app: Sphinx application; ``app.env.config`` is read here and
        ``app`` is forwarded to ``convertHTML``.
    :param doctree: document tree to modify in place; ``doctree['source']``
        is forwarded to ``convertHTML`` as the file path.
    """
    if not app.env.config.sphinx_md_processRaw:
        return

    source_path = doctree['source']
    converted = 0
    for raw_node in doctree.traverse(nodes.raw):
        soup = BeautifulSoup(raw_node.astext(), features="html.parser")
        first_tag = soup.find()
        if not first_tag:
            continue

        tag_name = first_tag.name
        if tag_name in known_start_tags:
            # Collect the converted nodes in a scratch container; only its
            # children end up in the document.
            holder = nodes.container()
            holder['id'] = 'html-content-' + str(converted)
            converted += 1
            convertHTML(soup, holder, app, source_path)
            raw_node.parent.replace(raw_node, holder.children)
        elif tag_name == hidden_tag:
            placeholder = nodes.comment()
            placeholder.append(nodes.Text("hidden"))
            raw_node.parent.replace(raw_node, placeholder)
34
+
35
# HTML tags that map one-to-one onto the docutils node class of the given name.
_SIMPLE_TAG_NODES = {
    "p": "paragraph",
    "thead": "thead",
    "tbody": "tbody",
    "tr": "row",
    "sup": "superscript",
    "code": "literal",
}


def _nextTableId(app, filepath):
    """Advance and return the table counter kept for ``filepath`` (starts at 0)."""
    counters = app.env.config.sphinx_md_tableIDs
    # Key on the actual source path so every document gets its own sequence.
    if filepath in counters:
        counters[filepath] += 1
    else:
        counters[filepath] = 0
    return counters[filepath]


def _convertTable(child, parent, app, filepath):
    """Append a ``table`` (empty title + ``tgroup``) to ``parent``; return the tgroup."""
    table = nodes.table()
    table['ids'].append("id" + str(_nextTableId(app, filepath)))

    tgroup = nodes.tgroup()
    ncols = getNumCols(child)
    tgroup['cols'] = ncols
    for _ in range(ncols):
        colspec = nodes.colspec()
        colspec["colwidth"] = 1
        tgroup += colspec

    table += nodes.title()
    table += tgroup
    parent += table
    return tgroup


def _convertImage(child, parent, app, filepath):
    """Append an ``image`` node built from an ``<img>`` tag to ``parent``; return it."""
    image = nodes.image()
    attrs = child.attrs
    candidates = {'*': ""}

    if "alt" in attrs:
        image["alt"] = attrs['alt']

    if "src" in attrs:
        src = attrs['src']
        if src.startswith(("http://", "https://")):
            # Remote image: keep the URL untouched. '?' is the candidate key
            # Sphinx uses for non-local image URIs.
            image["uri"] = src
            candidates = {'?': src}
        else:
            srcdir = app.env.srcdir
            rel_source = os.path.relpath(filepath, srcdir)
            docname = os.path.splitext(rel_source)[0]
            img_path = os.path.join(os.path.dirname(rel_source), src)
            image["uri"] = img_path
            candidates = {'*': img_path}
            # img_path is relative to the source directory, so resolve it
            # there rather than against the current working directory.
            on_disk = os.path.isfile(os.path.join(srcdir, img_path))
            if on_disk and img_path not in app.env.images:
                app.env.images[img_path] = ({docname}, os.path.basename(img_path))

    if "width" in attrs:
        width = attrs['width']
        # A bare number is given an explicit pixel unit.
        image["width"] = width + ('px' if width.isnumeric() else '')
    if "height" in attrs:
        image["height"] = attrs['height']

    # Stored as a mapping (not its string form) so it can be indexed by key.
    image["candidates"] = candidates
    parent += image
    return image


def convertHTML(soup, parent, app, filepath):
    """Recursively convert the children of ``soup`` into docutils nodes.

    Each child of ``soup`` is turned into a node that is appended to
    ``parent``; the function then recurses into the child with the new node
    as parent. Handled tags:

    * ``table``: ``table`` node with an id, an empty ``title`` and a
      ``tgroup`` carrying one ``colspec`` per column (recursion continues
      inside the tgroup);
    * ``img``: ``image`` node with ``alt``/``uri``/``width``/``height``;
      local images found on disk are registered in ``app.env.images``;
    * ``th`` / ``td``: ``entry`` containing a ``paragraph`` (recursion
      continues inside the paragraph);
    * ``a``: ``reference`` with ``refuri`` taken from ``href`` when present;
    * ``p``, ``thead``, ``tbody``, ``tr``, ``sup``, ``code``: the matching
      docutils node.

    Children without a tag name (text) become ``Text`` nodes when ``parent``
    is a docutils node. Any other tag is skipped together with its content.

    :param soup: BeautifulSoup object or tag whose children are converted;
        objects without a ``children`` attribute are ignored.
    :param parent: docutils node receiving the converted nodes.
    :param app: Sphinx application (``app.env`` is read and updated).
    :param filepath: path of the source document being processed.
    """
    if not hasattr(soup, "children"):
        return

    for child in soup.children:
        node = None
        name = getattr(child, "name", None)

        if name is None:
            if isinstance(parent, nodes.Node):
                node = nodes.Text(child)
                parent += node
        elif name == "table":
            node = _convertTable(child, parent, app, filepath)
        elif name == "img":
            node = _convertImage(child, parent, app, filepath)
        elif name in ("th", "td"):
            entry = nodes.entry()
            node = nodes.paragraph()
            entry += node
            parent += entry
        elif name == "a":
            node = nodes.reference()
            # Anchors without href (e.g. named anchors) carry no target.
            href = child.attrs.get('href')
            if href is not None:
                node["refuri"] = href
            parent += node
        elif name in _SIMPLE_TAG_NODES:
            node = getattr(nodes, _SIMPLE_TAG_NODES[name])()
            parent += node

        if node is not None:
            convertHTML(child, node, app, filepath)
120
+
121
def removeHTMLAttributes(soup, tagName):
    """Strip every attribute from all ``tagName`` tags found in ``soup``.

    :param soup: parsed document to search; modified in place.
    :param tagName: name of the tags whose attributes are removed.
    :return: the same ``soup`` object.
    """
    for element in soup.find_all(tagName):
        # Snapshot the names first: deleting changes ``element.attrs``.
        for attr_name in list(element.attrs):
            del element[attr_name]
    return soup
130
+
131
def replaceTag(soup, oldTag, newTag, delAttrs=True):
    """Rename every ``oldTag`` tag in ``soup`` to ``newTag``.

    :param soup: parsed document to search; modified in place.
    :param oldTag: name of the tags to rename.
    :param newTag: name given to the matching tags.
    :param delAttrs: when true (the default), all attributes of each renamed
        tag are removed as well.
    :return: the same ``soup`` object.
    """
    for element in soup.find_all(oldTag):
        element.name = newTag
        if not delAttrs:
            continue
        # Snapshot the names first: deleting changes ``element.attrs``.
        for attr_name in list(element.attrs):
            del element[attr_name]
    return soup
142
+
143
def fixImages(soup):
    """Rewrite every ``img`` tag in ``soup`` as an ``image`` tag.

    For each image the ``src`` attribute is moved to ``href`` and the ``alt``
    attribute is moved into a new child ``alt`` tag holding the text. Images
    lacking ``src`` or ``alt`` are still renamed; the missing part is simply
    skipped instead of raising ``KeyError``.

    :param soup: parsed document to search; modified in place.
    :return: the same ``soup`` object.
    """
    for imgTag in soup.find_all('img'):
        imgTag.name = "image"

        if 'src' in imgTag.attrs:
            imgTag['href'] = imgTag['src']
            del imgTag['src']

        if 'alt' in imgTag.attrs:
            altTag = soup.new_tag("alt")
            altTag.string = imgTag['alt']
            del imgTag['alt']
            imgTag.append(altTag)
    return soup
154
+
155
+
156
def addTGroup(soup):
    """Turn every ``table`` tag in ``soup`` into a ``tgroup`` wrapped in a new ``table``.

    Each original table tag is renamed to ``tgroup``, receives a ``cols``
    attribute with its column count, loses its ``class`` attribute, and is
    then wrapped in a freshly created ``table`` tag.

    :param soup: parsed document to search; modified in place.
    :return: the same ``soup`` object.
    """
    for tableTag in soup.find_all('table'):
        # Count columns per table; counting over the whole soup would give
        # every tgroup the combined total when several tables are present.
        numCols = getNumCols(tableTag)
        tableTag.name = 'tgroup'
        tableTag['cols'] = numCols
        del tableTag['class']
        wrap(tableTag, soup.new_tag("table"))
    return soup
165
+
166
def wrap(to_wrap, wrap_in):
    """Wrap ``to_wrap`` inside ``wrap_in``.

    ``to_wrap`` is replaced by ``wrap_in`` via ``replace_with``, and whatever
    that call returns is appended to ``wrap_in``.

    :param to_wrap: element to be wrapped.
    :param wrap_in: element that takes its place and becomes its container.
    """
    wrap_in.append(to_wrap.replace_with(wrap_in))
169
+
170
def getNumCols(soup):
    """Return the number of table columns found in ``soup``.

    The count is the number of ``th``/``td`` cells that are direct children
    of the first ``tr``. This also works for tables that have no header
    cells, and is not inflated by ``th`` cells appearing in later rows. When
    no row (or no cell in it) is found, the total number of ``th`` tags is
    returned instead.

    :param soup: BeautifulSoup object or table tag to inspect.
    :return: column count as an ``int`` (0 when nothing is found).
    """
    first_row = soup.find('tr')
    if first_row is not None:
        # recursive=False keeps cells of nested tables out of the count.
        cells = first_row.find_all(['th', 'td'], recursive=False)
        if cells:
            return len(cells)
    return len(soup.find_all('th'))
0 commit comments