1 | | -import urllib |
2 | | -import urlparse |
| 1 | +from urllib.request import urlopen, urlretrieve |
| 2 | +from urllib.parse import urlparse, urlunparse |
3 | 3 | import re |
4 | 4 | import os |
5 | 5 | from time import sleep |
6 | 6 |
| 7 | + |
7 | 8 | class MoinSpider: |
8 | | - def __init__(self,site='fox.vincefn.net', |
9 | | - exclude=["RecentChanges","action=", |
10 | | - "FindPage","TitleIndex","WordIndex", |
11 | | - "Help","template","Template","MoinMoin", |
12 | | - "UserPreferences","WikiSandBox", |
13 | | - "ScriptAlias","ScriptAlias"]): |
14 | | - self.u=urllib.URLopener() |
15 | | - #self.u.addheader(('USER_AGENT', 'Mozilla/4.0')) |
16 | | - self.base='href=\"/' |
17 | | - self.suffix="?action=print" |
18 | | - self.site=site |
19 | | - self.pages=[] # list of pairs [relative URL, page content] |
20 | | - self.d={} # dictionnary with keys=relative URL, value= short filename for the downloaded page |
21 | | - self.exclude=exclude |
22 | | - self.nbFail=0 # pages which failed to load |
23 | | - self.img=set() |
24 | | - def Weave(self, lnk='/Fox/FoxWiki',nbtry=3): |
25 | | - """ Download recursively all pages, starting from one relative URL. |
26 | | - """ |
27 | | - if self.d.has_key(lnk): # we already got that page ! |
28 | | - return |
29 | | - self.d[lnk]="wiki_%i.html"%(1000+len(self.d)) |
30 | | - url="http://"+self.site+lnk+self.suffix #:TODO: use urlparse ! |
31 | | - print() |
32 | | - print("Getting page: %s"%url) |
33 | | - print(" -> %s"%(self.d[lnk])) |
34 | | - nb=nbtry |
35 | | - cont=True |
36 | | - while(nb>0): |
37 | | - try: |
38 | | - p=self.u.open(url) |
39 | | - page=p.read() |
40 | | - nb=-1 |
41 | | - except IOError: |
42 | | - nb-=1 |
43 | | - print("IOError..... retry #%i"%(nbtry-nb)) |
44 | | - sleep(1) |
45 | | - if nb==0: |
46 | | - print("Failed to load page, after %i trials:"%nbtry,lnk) |
47 | | - self.nbFail+=1 |
48 | | - return |
49 | | - if re.search("This page does not exist yet",page)!=None: |
50 | | - print(" -> Page has not been written yet !") |
51 | | - self.d[lnk]="http://"+self.site+lnk # Link directly to site |
52 | | - return |
53 | | - self.pages.append([lnk,page]) |
54 | | - for m in re.finditer(r"href\=\"(.*?)\"",page): |
55 | | - newlink=m.group() |
56 | | - if len(newlink)>=len(self.base): |
57 | | - if newlink[:len(self.base)]==self.base: |
58 | | - keep=True |
59 | | - for x in self.exclude: |
60 | | - if re.search(x,newlink)!= None: |
61 | | - keep=False |
62 | | - break |
63 | | - if keep: |
64 | | - #print(" ->%s"%newlink) |
65 | | - newlink=newlink[6:-1]# [6:-1] -> exlude ' href=" ' and the end ' " ' |
66 | | - newlink=re.split('#',newlink)[0] # exclude anchors |
67 | | - self.Weave(newlink) |
68 | | - #else: |
69 | | - # print(" ->%s ? NO"%newlink) |
| 9 | + def __init__(self, site='fox.vincefn.net', |
| 10 | + exclude=["RecentChanges", "action=", |
| 11 | + "FindPage", "TitleIndex", "WordIndex", |
| 12 | + "Help", "template", "Template", "MoinMoin", |
| 13 | + "UserPreferences", "WikiSandBox", |
| 14 | + "ScriptAlias", "ScriptAlias"]): |
| 15 | + # self.u.addheader(('USER_AGENT', 'Mozilla/4.0')) |
| 16 | + self.base = 'href=\"/' |
| 17 | + self.suffix = "?action=print" |
| 18 | + self.site = site |
| 19 | + self.pages = [] # list of pairs [relative URL, page content] |
| 20 | +        self.d = {}  # dictionary: key = relative URL, value = short filename for the downloaded page
| 21 | + self.exclude = exclude |
| 22 | + self.nbFail = 0 # pages which failed to load |
| 23 | +        self.img = set()  # base names of images that have already been downloaded
70 | 24 |
71 | | - def WeaveStatic(self, pagelist,nbtry=3): |
72 | | - """ Alternative to weave: download a pre-selected list of pages |
73 | | - """ |
74 | | - for lnk in pagelist: |
75 | | - self.d[lnk]="wiki_%i.html"%(1000+len(self.d)) |
76 | | - url="http://"+self.site+lnk+self.suffix #:TODO: use urlparse ! |
77 | | - print("Getting page: %s -> %s"%(url,self.d[lnk])) |
78 | | - nb=nbtry |
79 | | - cont=True |
80 | | - while(nb>0): |
| 25 | + def Weave(self, lnk='/Fox/FoxWiki', nbtry=3): |
| 26 | +        """ Recursively download all pages, starting from one relative URL.
| 27 | + """ |
| 28 | +        if lnk in self.d:  # we already got that page!
| 29 | + return |
| 30 | + self.d[lnk] = "wiki_%i.html" % (1000 + len(self.d)) |
| 31 | + url = "http://" + self.site + lnk + self.suffix #:TODO: use urlparse ! |
| 32 | + print() |
| 33 | + print("Getting page: %s" % url) |
| 34 | + print(" -> %s" % (self.d[lnk])) |
| 35 | + nb = nbtry |
| 36 | + cont = True |
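| | +        # try the download up to nbtry times, sleeping 1 s between attempts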
| 37 | + while (nb > 0): |
81 | 38 | try: |
82 | | - print(url) |
83 | | - p=self.u.open(url) |
84 | | - page=p.read() |
85 | | - nb=-1 |
| 39 | + p = urlopen(url) |
| 40 | + page = p.read().decode('utf-8') |
| 41 | + nb = -1 |
86 | 42 | except IOError: |
87 | | - nb-=1 |
88 | | - print("IOError..... retry #%i"%(nbtry-nb)) |
89 | | - sleep(1) |
90 | | - if nb==0: |
91 | | - print("Failed to load page, after %i trials:"%nbtry,lnk) |
92 | | - if re.search("This page does not exist yet",page)!=None: |
93 | | - print(" -> Page has not been written yet !") |
94 | | - self.d[lnk]="http://"+self.site+lnk # Link directly to site |
95 | | - nb=0 |
96 | | - else: |
97 | | - self.pages.append([lnk,page]) |
| 43 | + nb -= 1 |
| 44 | + print("IOError..... retry #%i" % (nbtry - nb)) |
| 45 | + sleep(1) |
| 46 | + if nb == 0: |
| 47 | + print("Failed to load page, after %i trials:" % nbtry, lnk) |
| 48 | + self.nbFail += 1 |
| 49 | + return |
| 50 | + if re.search("This page does not exist yet", page) is not None: |
| 51 | + print(" -> Page has not been written yet !") |
| 52 | + self.d[lnk] = "http://" + self.site + lnk # Link directly to site |
| 53 | + return |
| 54 | + self.pages.append([lnk, page]) |
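| | +        # follow every same-site link that is not excluded, recursing into each new page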
| 55 | + for m in re.finditer(r"href\=\"(.*?)\"", page): |
| 56 | + newlink = m.group() |
| 57 | + if len(newlink) >= len(self.base): |
| 58 | + if newlink[:len(self.base)] == self.base: |
| 59 | + keep = True |
| 60 | + for x in self.exclude: |
| 61 | +                        if re.search(x, newlink) is not None:
| 62 | +                            keep = False
| 63 | +                            break
| 64 | + if keep: |
| 65 | + # print(" ->%s"%newlink) |
| 66 | +                        newlink = newlink[6:-1]  # [6:-1] -> exclude the leading ' href=" ' and the trailing ' " '
| 67 | + newlink = re.split('#', newlink)[0] # exclude anchors |
| 68 | + self.Weave(newlink) |
| 69 | + # else: |
| 70 | + # print(" ->%s ? NO"%newlink) |
| 71 | + |
| 72 | + def WeaveStatic(self, pagelist, nbtry=3): |
| 73 | +        """ Alternative to Weave: download a pre-selected list of pages.
| 74 | + """ |
| 75 | + for lnk in pagelist: |
| 76 | + self.d[lnk] = "wiki_%i.html" % (1000 + len(self.d)) |
| 77 | + url = "http://" + self.site + lnk + self.suffix #:TODO: use urlparse ! |
| 78 | + print("Getting page: %s -> %s" % (url, self.d[lnk])) |
| 79 | + nb = nbtry |
| 80 | + cont = True |
| 81 | + while (nb > 0): |
| 82 | + try: |
| 83 | + print(url) |
| 84 | + p = urlopen(url) |
| 85 | + page = p.read().decode('utf-8') |
| 86 | + nb = -1 |
| 87 | + except IOError: |
| 88 | + nb -= 1 |
| 89 | + print("IOError..... retry #%i" % (nbtry - nb)) |
| 90 | + sleep(1) |
| 91 | +            if nb == 0:
| 92 | +                print("Failed to load page, after %i trials:" % nbtry, lnk)
| | +                self.nbFail += 1
| | +                continue
| 93 | + if re.search("This page does not exist yet", page) is not None: |
| 94 | + print(" -> Page has not been written yet !") |
| 95 | + self.d[lnk] = "http://" + self.site + lnk # Link directly to site |
| 96 | + nb = 0 |
| 97 | + else: |
| 98 | + self.pages.append([lnk, page]) |
| 99 | + |
| 100 | + def Pages2Html(self, d="wikihtml"): |
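| | +        """ Rewrite the downloaded pages for offline use: fetch the images they
| | +        reference, convert wiki links to the local wiki_*.html filenames and
| | +        write the resulting files into the directory d.
| | +        """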
| 101 | + # TODO : remove links to non-written pages |
| 102 | + if not os.path.exists(d): |
| 103 | + os.mkdir(d) |
| 104 | + # this is necessary so that urls that contain other (smaller) urls |
| 105 | + # are replaced first |
| 106 | + ks = list(self.d.keys()) |
| 107 | + ks.sort(reverse=True) |
| 108 | + for p in self.pages: |
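| | +            # download each referenced image once and point its src at the local copy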
| 109 | + for m in re.finditer(r"img .*? src\=\"(.*?)\"", p[1]): |
| 110 | + print(re.findall(r"src\=\"(.*?)\"", m.group())) |
| 111 | + url = re.findall(r"src\=\"(.*?)\"", m.group())[0] |
| 112 | + up = urlparse(url) |
| 113 | + print(url) |
| 114 | + up0, up1, up2, up3, up4, up5 = up[0], up[1], up[2], up[3], up[4], up[5] |
| 115 | + if up4 != '': |
| 116 | + name = re.split('=', up4).pop() |
| 117 | + else: |
| 118 | + name = re.split('/', up2).pop() |
| 119 | + if name not in self.img: # download image once |
| 120 | + self.img.add(name) |
| 121 | + if up0 == '': |
| 122 | + up0 = 'http' |
| 123 | + if up1 == '': |
| 124 | + up1 = self.site |
| 125 | +                    urlimg = urlunparse((up0, up1, up2, up3, up4, up5)).replace('&amp;', '&')
| 126 | +                    print(" %s -> %s" % (urlimg, name))
| 127 | +                    nbTry = 3
| 128 | +                    nb = nbTry
| 129 | +                    while nb > 0:
| 130 | +                        try:
| 131 | +                            urlretrieve(urlimg, d + "/" + name)
| 132 | +                            nb = -1
| 133 | +                        except IOError:
| 134 | +                            nb -= 1
| 135 | +                            print("IOError..... retry #%i to get %s" % (nbTry - nb, name))
| 136 | +                            sleep(1)
| 137 | +                    if nb == 0:
| 138 | +                        print("Failed to load image, after %i trials: %s" % (nbTry, name))
| 139 | +                    else:  # KLUDGE: convert png->jpg because htmldoc chokes on png images...
| 140 | +                        if name[-4:] == ".png":
| 141 | +                            print("convert %s %s" % (d + "/" + name, d + "/" + name[:-3] + "jpg"))
| 142 | +                            os.system("convert %s %s" % (d + "/" + name, d + "/" + name[:-3] + "jpg"))
| 143 | +                            os.system("rm -f %s" % (d + "/" + name))
| 144 | + p[1] = p[1].replace(url, name) |
| 145 | + for k in ks: # change to local url |
| 146 | + if k != self.d[k]: |
| 147 | + p[1] = p[1].replace(k, self.d[k]) |
| 148 | + # Change src field of img from "wiki_1002.html?action=AttachFile&do=get&target=toto.jpg" |
| 149 | + # to "toto.jpg" |
| 150 | + p[1] = p[1].replace("%s?action=AttachFile&do=get&target=" % k, "") |
| 151 | + p[1] = p[1].replace(".png", ".jpg") |
| 152 | +            with open(d + "/" + self.d[p[0]], 'w') as f:
| 153 | +                f.write(p[1])
98 | 154 |
99 | | - def Pages2Html(self,d="wikihtml"): |
100 | | - #TODO : remove links to non-written pages |
101 | | - if not os.path.exists(d): |
102 | | - os.mkdir(d) |
103 | | - #this is necessary so that urls that contain other (smaller) urls |
104 | | - #are replaced first |
105 | | - ks=self.d.keys() |
106 | | - ks.sort(reverse=True) |
107 | | - for p in self.pages: |
108 | | - for m in re.finditer(r"img .*? src\=\"(.*?)\"",p[1]): |
109 | | - print(re.findall(r"src\=\"(.*?)\"",m.group())) |
110 | | - url=re.findall(r"src\=\"(.*?)\"",m.group())[0] |
111 | | - up=urlparse.urlparse(url) |
112 | | - print(url) |
113 | | - up0,up1,up2,up3,up4,up5=up[0],up[1],up[2],up[3],up[4],up[5] |
114 | | - if up4 != '': |
115 | | - name=re.split('=',up4).pop() |
116 | | - else: |
117 | | - name=re.split('/',up2).pop() |
118 | | - if name not in self.img:#download image once |
119 | | - self.img.add(name) |
120 | | - if up0=='': |
121 | | - up0='http' |
122 | | - if up1=='': |
123 | | - up1=self.site |
124 | | - urlimg=urlparse.urlunparse((up0,up1,up2,up3,up4,up5)).replace('&','&') |
125 | | - print(" %s -> %s"%(urlimg,name)) |
126 | | - nbTry=3 |
127 | | - nb=nbTry |
128 | | - while nb>0: |
129 | | - try: |
130 | | - urllib.urlretrieve(urlimg,d+"/"+name) |
131 | | - nb=-1 |
132 | | - except IOError: |
133 | | - nb-=1 |
134 | | - print("IOError..... retry #%i to get %s"%(nbTry-nb,name)) |
135 | | - sleep(1) |
136 | | - if nb==0: |
137 | | - print("Failed to load image, after %i trials: %s"%(nbtry,name)) |
138 | | - else: # KLUDGE png->png cause htmldoc chokes on these... |
139 | | - if name[-4:]==".png": |
140 | | - print("convert %s %s"%(d+"/"+name,d+"/"+name[:-3]+"jpg")) |
141 | | - os.system("convert %s %s"%(d+"/"+name,d+"/"+name[:-3]+"jpg")) |
142 | | - os.system("rm -f %s"%(d+"/"+name)) |
143 | | - p[1]=p[1].replace(url,name) |
144 | | - for k in ks:# change to local url |
145 | | - if k!=self.d[k]: |
146 | | - p[1]=p[1].replace(k,self.d[k]) |
147 | | - # Change src field of img from "wiki_1002.html?action=AttachFile&do=get&target=toto.jpg" to "toto.jpg" |
148 | | - p[1]=p[1].replace("%s?action=AttachFile&do=get&target="%k,"") |
149 | | - p[1]=p[1].replace(".png",".jpg") |
150 | | - f=open(d+"/"+self.d[p[0]],'w') |
151 | | - f.write(p[1]) |
152 | | - def Html2pdf(self,d="wikihtml"): |
153 | | - os.system("mogrify -resize '600x>' wikihtml/*.jpg") |
154 | | - #os.system("htmldoc --jpeg=85 --webpage %s/*.html --linkcolor blue -f wiki.pdf"%d) |
155 | | - os.system("htmldoc --jpeg=85 --webpage %s/*.html --linkcolor blue --size a4 --format pdf14 --links --book --toclevels 3 --left 1.5cm --right 1.5cm --top 1.5cm --bottom 1.5cm --footer Dc1 -f FoxManual.pdf"%d) |
156 | | - #os.system("rm -f wikihtml/*") |
| 155 | + def Html2pdf(self, d="wikihtml"): |
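| | +        # assumes ImageMagick's mogrify and the htmldoc tool are installed and on the PATH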
| 156 | +        os.system("mogrify -resize '600x>' %s/*.jpg" % d)
| 157 | + # os.system("htmldoc --jpeg=85 --webpage %s/*.html --linkcolor blue -f wiki.pdf"%d) |
| 158 | + os.system("htmldoc --jpeg=85 --webpage %s/*.html --linkcolor blue --size a4 --format pdf14 " |
| 159 | + "--links --book --toclevels 3 --left 1.5cm --right 1.5cm --top 1.5cm --bottom 1.5cm " |
| 160 | + "--footer Dc1 -f FoxManual.pdf" % d) |
| 161 | + # os.system("rm -f wikihtml/*") |
157 | 162 |
158 | | -#m=MoinSpider(site="objcryst.sourceforge.net") |
159 | | -m=MoinSpider(site="fox.vincefn.net") |
| 163 | + |
| 164 | +# m=MoinSpider(site="objcryst.sourceforge.net") |
| 165 | +m = MoinSpider(site="fox.vincefn.net") |
160 | 166 |
161 | 167 | m.WeaveStatic(["/FoxWiki", |
162 | 168 | "/BiblioReferences", |
@@ -185,11 +191,10 @@ def Html2pdf(self,d="wikihtml"): |
185 | 191 | "/FoxCompile", |
186 | 192 | "/Compile/Linux", |
187 | 193 | "/Compile/MacOSX", |
188 | | - #"/Compile/Windows" |
189 | | - #"/BiblioStructures", |
190 | | - #"/VincentFavreNicolin" |
| 194 | + # "/Compile/Windows" |
| 195 | + # "/BiblioStructures", |
| 196 | + # "/VincentFavreNicolin" |
191 | 197 | ]) |
192 | 198 |
|
193 | | - |
194 | 199 | m.Pages2Html() |
195 | 200 | m.Html2pdf() |