-
Notifications
You must be signed in to change notification settings - Fork 73
Expand file tree
/
Copy pathArchiveSearch.py
More file actions
104 lines (88 loc) · 2.83 KB
/
ArchiveSearch.py
File metadata and controls
104 lines (88 loc) · 2.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# -*- coding: utf-8 -*-
"""
Author: Sparrow
Purpose: download one entire blog archive from Tumblr in a single run.
Created: 2017-01-01
"""
import re
import time
import urllib.error
import urllib.request
from urllib.parse import quote
def getHtml(url):
    """Fetch *url* and return its body decoded as UTF-8.

    The URL is percent-quoted first (archive links carry query strings
    such as ``?before_time=...``). On any fetch or decode failure a
    message is printed and the literal string ``'Html'`` is returned —
    that sentinel is preserved from the original code so callers that
    run ``re.findall`` over the result still receive a string.
    """
    url = quote(url, safe='/:?=')
    try:
        # Context manager guarantees the HTTP response is closed even
        # if read()/decode() raises (the original leaked the handle).
        with urllib.request.urlopen(url) as page:
            return page.read().decode('utf-8')
    except (urllib.error.URLError, UnicodeDecodeError):
        # Narrowed from a bare except: network/HTTP errors and bad
        # payload encoding are the failures this path is meant for.
        print('The URL you requested could not be found')
        return 'Html'
def ArchivePostfix(url):
    """Return the archive-page URL for a blog root URL (root + 'archive')."""
    return url + 'archive'
def findNextpage(ArchiveURL, url, PageList, PageNum):
    """Walk the 'next page' chain starting at *url*, recording each page.

    Scrapes the ``<a id="next_page_link">`` anchor from every archive
    page and stores the absolute next-page URL into *PageList* (a dict
    keyed by 1-based page number), continuing until no next link is
    found. Mutates *PageList* in place and returns None.

    Rewritten iteratively: the original recursed once per archive page,
    so a long blog could exhaust Python's recursion limit.
    """
    # Compile once instead of per page.
    next_link = re.compile(r'<a id="next_page_link" href="/archive(\?before_time=.*?)"')
    while True:
        html = getHtml(url)
        matches = next_link.findall(html)
        if not matches:
            return
        url = ArchiveURL + matches[0]
        PageNum += 1
        PageList[PageNum] = url
        print('Page %s' % PageNum, url)
def findAllPage(url):
    """Collect every archive-page URL of the blog at *url*.

    Returns a dict mapping 1-based page number -> archive page URL;
    page 1 is always the blog's main archive page.
    """
    first_page = ArchivePostfix(url)
    pages = {1: first_page}
    # findNextpage fills `pages` in place by following next-page links.
    findNextpage(first_page, first_page, pages, 1)
    print('There is %s pages.' % len(pages))
    return pages
def FindCurrentPagePostUrl(url):
    """Scrape the post permalinks from a single archive page.

    Returns a list of post URLs, each round-tripped through gbk with
    unrepresentable characters dropped (keeps later console printing
    from failing), or False when the page yields no matches.
    """
    html = getHtml(url)
    link_pattern = re.compile(r'<a target="_blank" class="hover" title="" href="(.*?)"')
    found = link_pattern.findall(html)
    if not found:
        return False
    return [link.encode('gbk', 'ignore').decode('gbk') for link in found]
def findalltheposturl(url):
    """Gather the post URLs of every archive page of the blog at *url*.

    Returns a dict mapping 1-based page number -> list of post URLs
    (pages with no posts are skipped), or False when no archive pages
    were found at all.
    """
    PageList = findAllPage(url)
    if not PageList:
        print('There is no page!')
        return False
    PostUrlLists = {}
    # findAllPage keys its dict 1..N, so range over that.
    for page in range(1, len(PageList) + 1):
        Posturl = FindCurrentPagePostUrl(PageList[page])
        if not Posturl:
            print("There is no post in page %s!" % page)
            continue
        PostUrlLists[page] = Posturl
        try:
            print(page, PostUrlLists[page], sep=' ')
        except UnicodeEncodeError:
            # Narrowed from a bare except: the only failure this
            # fallback handles is a console encoding (e.g. gbk) that
            # cannot represent some URL characters — strip them and
            # keep the sanitized list.
            urls = PostUrlLists[page]
            PostUrlLists[page] = [u.encode('gbk', 'ignore').decode('gbk') for u in urls]
    return PostUrlLists
if __name__ == '__main__':
    # Keep prompting for blog URLs until the user answers 'Y' to quit.
    select = 'N'
    while select != 'Y':
        URL = input('Input url: ')
        start = time.time()
        findalltheposturl(URL)
        # reCodeURL(URL)
        end = time.time()
        elapsed = end - start
        print(start, end, '=> Cost %ss' % elapsed)
        select = input(" Do you want to Quit? [Y/N]")