This repository was archived by the owner on May 9, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathWallScraper.py
More file actions
151 lines (142 loc) · 5.61 KB
/
WallScraper.py
File metadata and controls
151 lines (142 loc) · 5.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
from HTMLParser import HTMLParser
from sys import exit
from base64 import standard_b64decode
import urllib2, re, os, time, threading, Queue
# Decoded image URLs ready for ThreadDownload worker threads.
queue = Queue.Queue()
# Wallpaper-page URLs awaiting fetch/base64 extraction by MyThreadedHTMLParser threads.
decode_queue = Queue.Queue()
class DownloadTracker():
    """Tracks how many wallpapers already exist, how many the user wants,
    and how many downloads have succeeded."""
    # Shared across all threads; incremented by ThreadDownload on each success.
    download_success = 0

    def track_downloads(self, file_count):
        """Record the count of files already present; batch offsets resume from there."""
        self.file_count = file_count
        self.current_download_count = file_count

    def total_downloads(self):
        """Prompt until the user enters a multiple of 60 (one result page = 60 images)."""
        while True:
            try:
                # raw_input + int instead of input(): Python 2 input() eval()s the
                # typed text, which is both fragile and unsafe.
                self.download_count = int(raw_input("How many wallpapers would you like to download? (Must be a multiple of 60): "))
                if self.download_count % 60 == 0:
                    break
                else:
                    # Fixed typo in the original message ("Please a multiple of 60.").
                    print("Please pick a multiple of 60.")
            except ValueError:
                # Narrowed from a bare except: only non-numeric input is expected here.
                print("Please pick a valid number.")
class MyHTMLParser(HTMLParser):
    """Scans start tags on a toplist page; every href that points at a
    wallbase.cc wallpaper page is queued for decoding."""
    def handle_starttag(self, tag, attrs):
        for name, value in attrs:
            if name != "href":
                continue
            # Only queue links of the form http://wallbase.cc/wallpaper/<digits>.
            if re.search(r"(?<=http://wallbase.cc/)wallpaper/[\d]+", value):
                decode_queue.put(value)
class TestyClass(HTMLParser):
    """Pulls the base64-encoded image source out of a wallpaper page's data
    and puts the decoded value on the download queue."""
    def handle_data(self, data):
        # The page embeds the image source as ...'+B(\'<base64>\')... inside a
        # script; match the payload between the escaped delimiters in repr() form.
        pattern = r"(?<=(%s))[\S]*(?=%s)" % (re.escape("'+B(\\'"), re.escape("\\')"))
        match = re.search(pattern, repr(data))
        if match is not None:
            queue.put(standard_b64decode(match.group()))
class MyThreadedHTMLParser(threading.Thread, HTMLParser):
    """Worker thread: takes one wallpaper-page URL from decode_queue, fetches
    its HTML, and feeds it to TestyClass (which queues the decoded image URL).

    NOTE(review): the HTMLParser base class is never initialised or used here --
    all parsing is delegated to TestyClass. Kept for interface compatibility.
    """
    def __init__(self, decode_queue):
        threading.Thread.__init__(self)
        self.decode_queue = decode_queue

    def run(self):
        try:
            # Non-blocking get: the caller spawns exactly one thread per queued
            # item, so a blocking get(True) could hang forever and made the
            # original Queue.Empty handler unreachable.
            url = self.decode_queue.get(False)
        except Queue.Empty:
            print("decode_queue.get failed")
            return
        try:
            html = open_url(url)
            if html is not None:
                parser2 = TestyClass()
                parser2.feed(html)
        finally:
            # task_done() only for an item actually retrieved; the original
            # called it in all paths, which raises ValueError after a failed get.
            self.decode_queue.task_done()
class ThreadDownload(threading.Thread):
    """Worker thread: takes one image URL from queue, downloads it, and writes
    it to a file named after the URL's "wallpaper..." tail."""
    # Guards the shared DownloadTracker.download_success counter: += on a
    # class attribute is a read-modify-write and is not thread-safe.
    counter_lock = threading.Lock()

    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        try:
            # Non-blocking get: one thread is spawned per queued item, so the
            # original blocking get() made the Queue.Empty handler unreachable.
            img_url = self.queue.get(False)
        except Queue.Empty:
            return
        try:
            img_data = urllib2.urlopen(img_url).read()
            filename = re.search(r"(wallpaper)[\S]+", str(img_url))
            output = open(str(filename.group()), 'wb')
            try:
                output.write(img_data)
            finally:
                # Close the file even if the write fails (original leaked it).
                output.close()
            with ThreadDownload.counter_lock:
                DownloadTracker.download_success += 1
        except urllib2.URLError as e:
            # Must be caught before IOError: URLError subclasses IOError in
            # Python 2, so the original ordering made this branch unreachable.
            if hasattr(e, 'reason'):
                print('\nReason: %s \nURL: %s' % (e.reason, img_url))
            elif hasattr(e, 'code'):
                # HTTPError carries a status code, not a reason (original
                # printed the nonexistent e.reason here).
                print('\nReason: %s \nURL: %s' % (e.code, img_url))
            else:
                print('\nReason: %s \nURL: %s' % (e, img_url))
        except IOError:
            print("Failed to download URL: %s" % img_url)
        finally:
            # Safe here: we only reach this block after a successful get().
            self.queue.task_done()
def open_url(url):
    """Fetch url and return the response body, or None on URLError.

    Errors are reported to stdout; callers must handle a None return.
    """
    try:
        response = urllib2.urlopen(url)
        try:
            html = response.read()
        finally:
            # Close the response even if read() raises (original leaked it).
            response.close()
        return html
    except urllib2.URLError as e:
        if hasattr(e, 'reason'):
            print('\nReason: %s \nURL: %s' % (e.reason, url))
        elif hasattr(e, 'code'):
            # HTTPError carries a status code, not a reason (original printed
            # the nonexistent e.reason in this branch).
            print('\nReason: %s \nURL: %s' % (e.code, url))
        else:
            # Neither attribute present: fall back to the exception itself
            # (original referenced e.reason here, which would AttributeError).
            print('\nReason: %s \nURL: %s' % (e, url))
def mkPicDir(file_path):
    """Create file_path if it does not exist; exit the program on failure.

    EAFP rewrite: the original checked isdir() first and then called mkdir(),
    which is race-prone if the directory appears between the two calls.
    """
    try:
        os.mkdir(file_path)
    except OSError:
        # mkdir failing because the directory already exists is fine;
        # anything else (permissions, bad path) is fatal.
        if not os.path.isdir(file_path):
            print("Directory could not be created.")
            exit()
def main():
    """Download toplist wallpapers from wallbase.cc in batches of 60 into ./pics."""
    # os.path.join instead of the original '.\pics' literal, which only made
    # sense on Windows (and relied on '\p' not being an escape sequence).
    file_path = os.path.join('.', 'pics')
    mkPicDir(file_path)
    os.chdir(file_path)
    start_time = time.time()
    dt = DownloadTracker()
    # Count files already in ./pics so downloading resumes where the user left off.
    # The top list is dynamic, so some wallpapers may be missed or re-requested.
    # Possible improvement: split URLs from decode_queue (split('/')[-1]) and
    # check isfile in ./pics before queueing.
    files_in_file_path = len([f for f in os.listdir('.') if os.path.isfile(f)])
    dt.track_downloads(files_in_file_path)
    dt.total_downloads()
    while True:
        # Each toplist page shows 60 wallpapers, offset by current_download_count.
        url = "http://wallbase.cc/toplist/" + str(dt.current_download_count) + "/23/gteq/1920x1080/1.77/110/60/0"
        parser = MyHTMLParser()
        html = open_url(url)
        if html is None:
            print("Main() 1st While: html is none")
            continue
        # Parse the page for wallpaper hrefs; MyHTMLParser fills decode_queue.
        parser.feed(html)
        # One parser thread per queued wallpaper page.
        for i in range(decode_queue.qsize()):
            t = MyThreadedHTMLParser(decode_queue)
            t.start()
        decode_queue.join()
        # One download thread per decoded image URL.
        for i in range(queue.qsize()):
            td = ThreadDownload(queue)
            td.start()
        queue.join()
        # Original was '"Batch finished: " % (...)', which raises TypeError --
        # the format string had no conversion specifier for the elapsed time.
        print("Batch finished: %.2f" % (time.time() - start_time))
        dt.current_download_count += 60
        print("Images downloaded: " + str(dt.download_success))
        print("Images left to download: " + str(int(dt.download_count) + dt.file_count - dt.current_download_count))
        if dt.current_download_count >= (int(dt.download_count) + dt.file_count):
            break
    if dt.download_success != dt.download_count:
        print("Images to retry, once I put in that feature: " + str(dt.download_count - dt.download_success))
    print("%.2f" % (time.time() - start_time))


# Guarded entry point: the original called main() unconditionally at import time.
if __name__ == "__main__":
    main()