1
+ from bs4 import BeautifulSoup , Tag
2
+ from docutils import nodes
3
+ import os .path
4
+
5
# Names of the HTML tags that html2Docutils is willing to convert: a raw
# fragment is only processed when its first tag is one of these.
known_start_tags = ["p", "img", "table"]
6
+
7
def html2Docutils(app, doctree):
    """Replace convertible raw HTML nodes in *doctree* with docutils nodes.

    Every ``nodes.raw`` node is parsed with BeautifulSoup.  When the first
    tag of the fragment is listed in ``known_start_tags`` the fragment is
    converted by ``convertHTML`` into a new ``nodes.container`` which then
    takes the place of the raw node.  Other raw nodes are left untouched.

    Args:
        app: Application object; passed through to ``convertHTML`` unchanged.
        doctree: Document tree that is rewritten in place.  Its ``'source'``
            attribute is used as the path of the document being processed.
    """
    filepath = doctree['source']
    # Snapshot the raw nodes first: the tree is modified while we walk it.
    for node in list(doctree.traverse(nodes.raw)):
        soup = BeautifulSoup(node.astext(), features="html.parser")
        first_tag = soup.find()
        # Raw content without any tag (plain text, a comment, ...) yields
        # no first tag at all; there is nothing to convert in that case.
        if first_tag is None or first_tag.name not in known_start_tags:
            continue
        div = nodes.container()
        convertHTML(soup, div, app, filepath)
        # Replace the raw node with the output of the converter.
        node.parent.replace(node, div)
21
+
22
def _convertTable(table, parent, app, filepath):
    """Append a table node for the HTML *table* tag to *parent*; return its tgroup."""
    # Number the tables of each source document independently, starting at 0,
    # so that every table of a document gets its own id.
    tableIDs = app.env.config.sphinx_md_tableIDs
    if filepath not in tableIDs:
        tableIDs[filepath] = 0
    else:
        tableIDs[filepath] += 1

    tNode = nodes.table()
    tNode['ids'].append("id" + str(tableIDs[filepath]))
    tgroup = nodes.tgroup()
    ncols = getNumCols(table)
    tgroup['cols'] = ncols
    for _ in range(ncols):
        colspecNode = nodes.colspec()
        colspecNode["colwidth"] = 1
        tgroup += colspecNode
    tNode += nodes.title()
    tNode += tgroup
    parent += tNode
    return tgroup


def _convertImage(img, parent, app, filepath):
    """Append an image node for the HTML *img* tag to *parent* and return it."""
    node = nodes.image()
    imgPath = ""
    if "alt" in img.attrs:
        node["alt"] = img.attrs['alt']
    if "src" in img.attrs:
        # Path of the document relative to the source directory.
        docRelPath = os.path.relpath(filepath, app.env.srcdir)
        docfilename = os.path.splitext(docRelPath)[0]
        imgPath = os.path.join(os.path.dirname(docRelPath), img.attrs['src'])
        node["uri"] = imgPath
        # imgPath is relative to the source directory, so look for the file
        # there rather than relative to the current working directory.
        if os.path.isfile(os.path.join(app.env.srcdir, imgPath)):
            if imgPath not in app.env.images:
                imageFileName = os.path.basename(imgPath)
                app.env.images[imgPath] = ({docfilename}, imageFileName)
    if "width" in img.attrs:
        node["width"] = img.attrs['width']
    if "height" in img.attrs:
        node["height"] = img.attrs['height']
    # Stored as a real mapping, not as the text of one.
    node["candidates"] = {'*': imgPath}
    parent += node
    return node


def convertHTML(soup, parent, app, filepath):
    """Recursively convert the children of *soup* into docutils nodes.

    Handled tags are ``table``, ``thead``, ``tbody``, ``tr``, ``th``/``td``,
    ``p`` and ``img``.  Any other tag is skipped together with its content.
    Text is only kept when *parent* is an entry, paragraph or image node.

    Args:
        soup: Parsed HTML object whose ``children`` are converted; objects
            without a ``children`` attribute are ignored.
        parent: docutils node that receives the converted children.
        app: Application object.  ``app.env.config.sphinx_md_tableIDs``,
            ``app.env.srcdir`` and ``app.env.images`` are read and updated.
        filepath: Path of the source document; key of the per-document
            table counter and base for resolving image paths.
    """
    if not hasattr(soup, "children"):
        return

    # Tags that map one-to-one onto an empty docutils element.
    simpleTags = {
        "p": nodes.paragraph,
        "thead": nodes.thead,
        "tbody": nodes.tbody,
        "tr": nodes.row,
        "th": nodes.entry,
        "td": nodes.entry,
    }

    for child in soup.children:
        node = None
        if child.name is None:
            # Plain text: only kept inside these kinds of parent.
            if isinstance(parent, (nodes.entry, nodes.paragraph, nodes.image)):
                node = nodes.Text(child)
                parent += node
        elif child.name == "table":
            # Rows of the table are attached to the tgroup, not the table.
            node = _convertTable(child, parent, app, filepath)
        elif child.name == "img":
            node = _convertImage(child, parent, app, filepath)
        elif child.name in simpleTags:
            node = simpleTags[child.name]()
            parent += node

        # Explicit None test: do not rely on the truthiness of a node.
        if node is not None:
            convertHTML(child, node, app, filepath)
89
+ # soup = removeHTMLAttributes(soup,"table")
90
+ # soup = removeHTMLAttributes(soup,"p")
91
+ # soup = replaceTag(soup,"tr","row")
92
+ # soup = replaceTag(soup,"th","entry")
93
+ # soup = replaceTag(soup,"td","entry")
94
+ # soup = addTGroup(soup)
95
+ # soup = fixImages(soup)
96
+ # ditaXML = str(soup)
97
+
98
def removeHTMLAttributes(soup, tagName):
    """Strip every attribute from all ``tagName`` tags found in *soup*.

    The tags are modified in place; *soup* itself is returned.
    """
    for element in soup.find_all(tagName):
        # Copy the attribute names first so that deleting does not disturb
        # the iteration over ``attrs``.
        for attrName in list(element.attrs):
            del element[attrName]
    return soup
107
+
108
def replaceTag(soup, oldTag, newTag, delAttrs=True):
    """Rename every ``oldTag`` tag in *soup* to ``newTag``.

    When *delAttrs* is true (the default) all attributes of the renamed tags
    are removed as well.  The tags are modified in place; *soup* is returned.
    """
    for element in soup.find_all(oldTag):
        element.name = newTag
        if not delAttrs:
            continue
        # Copy the attribute names first so that deleting does not disturb
        # the iteration over ``attrs``.
        for attrName in list(element.attrs):
            del element[attrName]
    return soup
119
+
120
def fixImages(soup):
    """Rewrite every ``img`` tag in *soup* as an ``image`` tag.

    For each image the tag is renamed to ``image``, its ``src`` attribute is
    moved to ``href`` and its ``alt`` attribute is turned into an ``alt``
    child tag holding the alternative text.  Images lacking ``src`` or
    ``alt`` are still renamed; the missing part is simply skipped instead of
    raising ``KeyError``.  The tags are modified in place; *soup* is returned.
    """
    for imgTag in soup.find_all('img'):
        imgTag.name = "image"
        if 'src' in imgTag.attrs:
            imgTag['href'] = imgTag['src']
            del imgTag['src']
        if 'alt' in imgTag.attrs:
            altTag = soup.new_tag("alt")
            altTag.string = imgTag['alt']
            del imgTag['alt']
            imgTag.append(altTag)
    return soup
131
+
132
+
133
def addTGroup(soup):
    """Turn every ``table`` tag in *soup* into a ``tgroup`` wrapped in a new ``table``.

    Each original table is renamed to ``tgroup``, receives a ``cols``
    attribute holding its own column count (as reported by ``getNumCols``),
    loses its ``class`` attribute and is then wrapped in a freshly created
    ``table`` tag.  The tags are modified in place; *soup* is returned.
    """
    for tableTag in soup.find_all('table'):
        # Count the columns of this table only, so that several tables in
        # one fragment do not all receive the same, combined count.
        numCols = getNumCols(tableTag)
        tableTag.name = 'tgroup'
        tableTag['cols'] = numCols
        del tableTag['class']
        wrap(tableTag, soup.new_tag("table"))
    return soup
142
+
143
def wrap(to_wrap, wrap_in):
    """Put *wrap_in* where *to_wrap* was and nest the replaced element inside it.

    ``to_wrap.replace_with(wrap_in)`` is called first; whatever it returns
    is then appended to *wrap_in*.
    """
    wrap_in.append(to_wrap.replace_with(wrap_in))
146
+
147
def getNumCols(soup):
    """Return the number of columns of the first table row found in *soup*.

    The direct ``th``/``td`` cells of the first ``tr`` are counted, each cell
    contributing its ``colspan`` (1 when absent or not a positive integer).
    This also works for tables that have no ``th`` cells at all and for
    tables with several header rows.  When *soup* contains no ``tr`` the
    number of ``th`` tags is returned instead.
    """
    firstRow = soup.find('tr')
    if firstRow is None:
        # No row to inspect: fall back to counting header cells.
        return len(soup.find_all('th'))

    numCols = 0
    # recursive=False keeps cells of nested tables out of the count.
    for cell in firstRow.find_all(['th', 'td'], recursive=False):
        try:
            span = int(cell.attrs.get('colspan', 1))
        except (TypeError, ValueError):
            span = 1
        numCols += max(span, 1)
    return numCols
0 commit comments