This repository was archived by the owner on May 9, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathWallScraper.py
More file actions
151 lines (142 loc) · 5.61 KB
/
WallScraper.py
File metadata and controls
151 lines (142 loc) · 5.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
from HTMLParser import HTMLParser
from sys import exit
from base64 import standard_b64decode
import urllib2, re, os, time, threading, Queue
# Decoded image URLs ready for ThreadDownload worker threads.
queue = Queue.Queue()
# Wallpaper-page URLs awaiting fetch/base64 extraction by MyThreadedHTMLParser threads.
decode_queue = Queue.Queue()
class DownloadTracker():
    """Tracks how many wallpapers already exist, how many the user wants,
    and how many downloads have succeeded."""
    # Shared across all threads; incremented by ThreadDownload on each success.
    download_success = 0

    def track_downloads(self, file_count):
        """Record the count of files already present; batch offsets resume from there."""
        self.file_count = file_count
        self.current_download_count = file_count

    def total_downloads(self):
        """Prompt until the user enters a multiple of 60 (one result page = 60 images)."""
        while True:
            try:
                # raw_input + int instead of input(): Python 2 input() eval()s the
                # typed text, which is both fragile and unsafe.
                self.download_count = int(raw_input("How many wallpapers would you like to download? (Must be a multiple of 60): "))
                if self.download_count % 60 == 0:
                    break
                else:
                    # Fixed typo in the original message ("Please a multiple of 60.").
                    print("Please pick a multiple of 60.")
            except ValueError:
                # Narrowed from a bare except: only non-numeric input is expected here.
                print("Please pick a valid number.")
class MyHTMLParser(HTMLParser):
    """Scans start tags on a toplist page; every href that points at a
    wallbase.cc wallpaper page is queued for decoding."""
    def handle_starttag(self, tag, attrs):
        for name, value in attrs:
            if name != "href":
                continue
            # Only queue links of the form http://wallbase.cc/wallpaper/<digits>.
            if re.search(r"(?<=http://wallbase.cc/)wallpaper/[\d]+", value):
                decode_queue.put(value)
class TestyClass(HTMLParser):
    """Pulls the base64-encoded image source out of a wallpaper page's data
    and puts the decoded value on the download queue."""
    def handle_data(self, data):
        # The page embeds the image source as ...'+B(\'<base64>\')... inside a
        # script; match the payload between the escaped delimiters in repr() form.
        pattern = r"(?<=(%s))[\S]*(?=%s)" % (re.escape("'+B(\\'"), re.escape("\\')"))
        match = re.search(pattern, repr(data))
        if match is not None:
            queue.put(standard_b64decode(match.group()))
class MyThreadedHTMLParser(threading.Thread, HTMLParser):
    """Worker thread: takes one wallpaper-page URL from decode_queue, fetches
    its HTML, and feeds it to TestyClass (which queues the decoded image URL).

    NOTE(review): the HTMLParser base class is never initialised or used here --
    all parsing is delegated to TestyClass. Kept for interface compatibility.
    """
    def __init__(self, decode_queue):
        threading.Thread.__init__(self)
        self.decode_queue = decode_queue

    def run(self):
        try:
            # Non-blocking get: the caller spawns exactly one thread per queued
            # item, so a blocking get(True) could hang forever and made the
            # original Queue.Empty handler unreachable.
            url = self.decode_queue.get(False)
        except Queue.Empty:
            print("decode_queue.get failed")
            return
        try:
            html = open_url(url)
            if html is not None:
                parser2 = TestyClass()
                parser2.feed(html)
        finally:
            # task_done() only for an item actually retrieved; the original
            # called it in all paths, which raises ValueError after a failed get.
            self.decode_queue.task_done()
class ThreadDownload(threading.Thread):
    """Worker thread: takes one image URL from queue, downloads it, and writes
    it to a file named after the URL's "wallpaper..." tail."""
    # Guards the shared DownloadTracker.download_success counter: += on a
    # class attribute is a read-modify-write and is not thread-safe.
    counter_lock = threading.Lock()

    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        try:
            # Non-blocking get: one thread is spawned per queued item, so the
            # original blocking get() made the Queue.Empty handler unreachable.
            img_url = self.queue.get(False)
        except Queue.Empty:
            return
        try:
            img_data = urllib2.urlopen(img_url).read()
            filename = re.search(r"(wallpaper)[\S]+", str(img_url))
            output = open(str(filename.group()), 'wb')
            try:
                output.write(img_data)
            finally:
                # Close the file even if the write fails (original leaked it).
                output.close()
            with ThreadDownload.counter_lock:
                DownloadTracker.download_success += 1
        except urllib2.URLError as e:
            # Must be caught before IOError: URLError subclasses IOError in
            # Python 2, so the original ordering made this branch unreachable.
            if hasattr(e, 'reason'):
                print('\nReason: %s \nURL: %s' % (e.reason, img_url))
            elif hasattr(e, 'code'):
                # HTTPError carries a status code, not a reason (original
                # printed the nonexistent e.reason here).
                print('\nReason: %s \nURL: %s' % (e.code, img_url))
            else:
                print('\nReason: %s \nURL: %s' % (e, img_url))
        except IOError:
            print("Failed to download URL: %s" % img_url)
        finally:
            # Safe here: we only reach this block after a successful get().
            self.queue.task_done()
def open_url(url):
    """Fetch url and return the response body, or None on URLError.

    Errors are reported to stdout; callers must handle a None return.
    """
    try:
        response = urllib2.urlopen(url)
        try:
            html = response.read()
        finally:
            # Close the response even if read() raises (original leaked it).
            response.close()
        return html
    except urllib2.URLError as e:
        if hasattr(e, 'reason'):
            print('\nReason: %s \nURL: %s' % (e.reason, url))
        elif hasattr(e, 'code'):
            # HTTPError carries a status code, not a reason (original printed
            # the nonexistent e.reason in this branch).
            print('\nReason: %s \nURL: %s' % (e.code, url))
        else:
            # Neither attribute present: fall back to the exception itself
            # (original referenced e.reason here, which would AttributeError).
            print('\nReason: %s \nURL: %s' % (e, url))
def mkPicDir(file_path):
    """Create file_path if it does not exist; exit the program on failure.

    EAFP rewrite: the original checked isdir() first and then called mkdir(),
    which is race-prone if the directory appears between the two calls.
    """
    try:
        os.mkdir(file_path)
    except OSError:
        # mkdir failing because the directory already exists is fine;
        # anything else (permissions, bad path) is fatal.
        if not os.path.isdir(file_path):
            print("Directory could not be created.")
            exit()
def main():
    """Download toplist wallpapers from wallbase.cc in batches of 60 into ./pics."""
    # os.path.join instead of the original '.\pics' literal, which only made
    # sense on Windows (and relied on '\p' not being an escape sequence).
    file_path = os.path.join('.', 'pics')
    mkPicDir(file_path)
    os.chdir(file_path)
    start_time = time.time()
    dt = DownloadTracker()
    # Count files already in ./pics so downloading resumes where the user left off.
    # The top list is dynamic, so some wallpapers may be missed or re-requested.
    # Possible improvement: split URLs from decode_queue (split('/')[-1]) and
    # check isfile in ./pics before queueing.
    files_in_file_path = len([f for f in os.listdir('.') if os.path.isfile(f)])
    dt.track_downloads(files_in_file_path)
    dt.total_downloads()
    while True:
        # Each toplist page shows 60 wallpapers, offset by current_download_count.
        url = "http://wallbase.cc/toplist/" + str(dt.current_download_count) + "/23/gteq/1920x1080/1.77/110/60/0"
        parser = MyHTMLParser()
        html = open_url(url)
        if html is None:
            print("Main() 1st While: html is none")
            continue
        # Parse the page for wallpaper hrefs; MyHTMLParser fills decode_queue.
        parser.feed(html)
        # One parser thread per queued wallpaper page.
        for i in range(decode_queue.qsize()):
            t = MyThreadedHTMLParser(decode_queue)
            t.start()
        decode_queue.join()
        # One download thread per decoded image URL.
        for i in range(queue.qsize()):
            td = ThreadDownload(queue)
            td.start()
        queue.join()
        # Original was '"Batch finished: " % (...)', which raises TypeError --
        # the format string had no conversion specifier for the elapsed time.
        print("Batch finished: %.2f" % (time.time() - start_time))
        dt.current_download_count += 60
        print("Images downloaded: " + str(dt.download_success))
        print("Images left to download: " + str(int(dt.download_count) + dt.file_count - dt.current_download_count))
        if dt.current_download_count >= (int(dt.download_count) + dt.file_count):
            break
    if dt.download_success != dt.download_count:
        print("Images to retry, once I put in that feature: " + str(dt.download_count - dt.download_success))
    print("%.2f" % (time.time() - start_time))


# Guarded entry point: the original called main() unconditionally at import time.
if __name__ == "__main__":
    main()