-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathH_picture_threads.py
More file actions
128 lines (92 loc) · 4.67 KB
/
H_picture_threads.py
File metadata and controls
128 lines (92 loc) · 4.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# Tag/keyword to search for; also used as the name of the local output directory.
searchKeyword = "福利"
## If the search results span multiple pages, one thread is spawned per page.
# Example keywords: YUZUKI, 小丁Cosplay, 福利
# Base URL of the site; view-page URIs and image paths are appended to this.
web = "https://asiansister.com/"
# URL of the first search-result page for the chosen keyword.
searchPage_url = "https://asiansister.com/tag.php?tag=" + searchKeyword
# User-Agent string sent with every request (requests headers and urllib opener).
UserAgent = "Chrome/77.0.3865.90"
def get_and_save_all_Hpicture(url):
    """Scrape one search-result page and download every image it links to.

    For each ``.itemBox`` entry on the result page, follows the link to its
    view page, collects all lazy-loaded images, and saves them under
    ``./<searchKeyword>/<view-page-id>/<image-name>``.  When a full-size VIP
    image returns 404, falls back to downloading its thumbnail.

    Args:
        url: Absolute URL of a search-result page on asiansister.com.
    """
    import os
    import requests
    import urllib.request
    from bs4 import BeautifulSoup

    headers = {'User-Agent': UserAgent}
    session = requests.session()
    searchPage_source = session.get(url, headers=headers).text
    searchPage_DOM = BeautifulSoup(searchPage_source, 'html.parser')
    searchPage_itemBox_Tags = searchPage_DOM.select(".itemBox")

    opener = urllib.request.build_opener()
    # BUG FIX: addheaders tuples are (header-name, value).  The original passed
    # the UA string as the header *name*, so no real User-Agent was ever sent.
    opener.addheaders = [('User-Agent', UserAgent)]
    # NOTE: install_opener is process-global and shared by all threads; hoisted
    # out of the download loop since installing it once is sufficient.
    urllib.request.install_opener(opener)

    for itemBox_Tag in searchPage_itemBox_Tags:
        viewPage_uri = itemBox_Tag.a.get("href")
        viewPage_url = web + viewPage_uri
        print("正在進入..." + viewPage_url)
        viewPage_source = session.get(viewPage_url, headers=headers).text
        viewPage_DOM = BeautifulSoup(viewPage_source, 'html.parser')
        image_Tags = viewPage_DOM.find_all("img", class_ = "lazyload showMiniImage")

        # Create the target directory up front (exist_ok also makes this safe
        # when several threads race on the same path) instead of waiting for a
        # FileNotFoundError on the first urlretrieve as the original did.
        dir_path = './' + searchKeyword + '/' + viewPage_uri[5:]
        os.makedirs(dir_path, exist_ok=True)

        for image_Tag in image_Tags:
            # Strip the first 5 chars of the dataurl attribute to get a
            # site-relative path — assumes a fixed 5-char prefix; TODO confirm.
            image_uri_path = image_Tag.get("dataurl")[5:]
            image_uri = image_uri_path.split('/')[-1]
            image_url = web + image_uri_path
            if "_t.jpg" in image_url:  ## VIP picture: drop "_t" to request full size
                image_url = image_url[:-6] + image_url[-4:]
            print(image_uri)
            savepath = dir_path + '/' + image_uri
            try:
                urllib.request.urlretrieve(image_url, savepath)
            except urllib.error.HTTPError:
                # Full-size image missing (404) — fall back to the thumbnail.
                print("404 Error... Can't find this picture origin size.")
                urllib.request.urlretrieve(image_url[:-4] + '_t' + image_url[-4:], savepath)
# --- Main script: fetch the first search-result page, detect pagination, and
# --- either fan out one worker thread per page or scrape the single page.
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': UserAgent}
session = requests.session()
searchPage_source = session.get(searchPage_url, headers=headers).text
searchPage_DOM = BeautifulSoup(searchPage_source, 'html.parser')

if "btn page" in searchPage_source:
    # Pagination present: number of pages = page buttons + the current page.
    page_btns = searchPage_DOM.find_all("a", class_ = "btn page")
    page_total = len(page_btns) + 1
    import threading
    threads = []
    for page_no in range(1, page_total + 1):
        # Loop-local URL — the original rebound the module-global
        # searchPage_url here, silently clobbering it for later readers.
        page_url = "https://asiansister.com/tag.php?tag=" + searchKeyword + "&page=" + str(page_no)
        print("===== 第 " + str(page_no) + " 頁: " + page_url)
        threads.append(threading.Thread(target = get_and_save_all_Hpicture, args = (page_url,)))
    for t in threads:
        t.start()
    for t in threads:
        t.join()
else:
    # Single page of results: reuse the scraping routine instead of the
    # original's duplicated inline copy of it (which repeated the User-Agent
    # header bug and lacked the 404 thumbnail fallback).
    get_and_save_all_Hpicture(searchPage_url)