1
+ import json
2
+ import random
3
+ import requests
4
+ from lxml import etree
5
+ import re
6
+ import pymysql
7
+ from DBUtils .PooledDB import PooledDB
8
+ from multiprocessing .pool import ThreadPool
9
+
10
# Shared MySQL connection pool; run() borrows one connection per scraped page.
# NOTE(review): host, user and password are hard-coded below -- consider
# loading them from configuration or the environment instead.
POOL = PooledDB(
    creator=pymysql,  # DB-API module used to open the connections
    maxconnections=0,  # maximum connections the pool allows; 0 or None means unlimited
    mincached=20,  # idle connections created when the pool is initialised; 0 creates none
    # NOTE(review): mincached (20) is larger than maxcached (5). DBUtils is
    # believed to raise maxcached to mincached in that case -- confirm intent.
    maxcached=5,  # maximum idle connections kept in the pool; 0 or None means unlimited
    # Maximum number of shared connections. The original author's note says
    # this setting has no effect with pymysql/MySQLdb because their
    # threadsafety level is 1 -- TODO confirm against the DBUtils docs.
    maxshared=0,
    blocking=True,  # when no connection is available: True waits, False raises an error
    maxusage=None,  # how many times one connection may be reused; None means unlimited
    setsession=[],  # commands run at session start, e.g. ["set datestyle to ...", "set time zone ..."]
    ping=0,
    # When to ping the MySQL server to check that it is reachable:
    # 0 = None = never, 1 = default = whenever it is requested,
    # 2 = when a cursor is created, 4 = when a query is executed, 7 = always
    host='127.0.0.1',
    port=3306,
    user='123',
    password='123',
    database='ys',
    charset='utf8'
)
28
class Ji:
    """A name/URL pair describing one entry of a play or download list.

    fenji() builds one instance per ``name$url`` entry and stores the
    instance ``__dict__``, so the attribute names double as the JSON keys.
    """

    # Class-level defaults; every instance overrides both in __init__.
    name = ""
    url = ""

    def __init__(self, name, url):
        # Assigned left to right, so ``__dict__`` keeps the order name, url.
        self.name, self.url = name, url
34
+
35
# Proxy URLs available for page requests: appended to by get_proxy(), picked
# from (and pruned on failure) by getHtml().
proxys = []
# HTTP headers that getHtml() passes with each page request.
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'}
37
# Fetch proxies
def get_proxy(timeout=5):
    """Scrape proxy URLs from ip.jiangxianli.com and add them to ``proxys``.

    Nothing is added when the site does not answer with HTTP 200 or the
    response cannot be parsed as HTML.

    Args:
        timeout: Seconds to wait for the proxy-list site. requests applies no
            timeout by default, so without one a stalled server would block
            the calling thread indefinitely.

    Raises:
        requests.RequestException: If the proxy-list request itself fails.
    """
    r = requests.get("http://ip.jiangxianli.com/", timeout=timeout)
    if r.status_code != 200:
        return
    selector = etree.HTML(r.text)
    if selector is None:
        # Presumably what lxml yields for an empty document; nothing to add.
        return
    # normalize-space() keeps the match working whether or not the class
    # attribute carries stray leading/trailing whitespace.
    found = selector.xpath(
        "//button[normalize-space(@class)='btn btn-sm btn-copy']/@data-url"
    )
    # Extend in place rather than rebinding the global, so threads that are
    # concurrently pruning the list keep working on the same object.
    proxys.extend(found)
44
# Fetch the HTML of a page
def getHtml(url, max_attempts=1000):
    """Download ``url`` through a randomly chosen proxy and return its text.

    A proxy that fails is dropped from ``proxys`` and another one is tried;
    the list is refilled through get_proxy() whenever it runs low.

    Args:
        url: Address of the page to download.
        max_attempts: Upper bound on the number of proxies tried, so that an
            unreachable site cannot keep a worker retrying forever.

    Returns:
        The response body as text.

    Raises:
        RuntimeError: If no proxy is available or every attempt failed.
    """
    for _ in range(max_attempts):
        if not proxys:
            get_proxy()
        try:
            p = random.choice(proxys)
        except IndexError:
            # Still empty after a refresh (or emptied by another thread).
            raise RuntimeError("no proxies available to fetch " + url) from None
        try:
            return requests.get(
                url, proxies={'http': p}, headers=header, timeout=5
            ).text
        except Exception:
            # Deliberately broad: scraped proxies fail in many ways
            # (timeouts, resets, malformed addresses) and all of them mean
            # "discard this proxy and try another".
            try:
                proxys.remove(p)
            except ValueError:
                # Another worker thread already removed the same proxy.
                pass
            if len(proxys) < 2:
                get_proxy()
    raise RuntimeError(
        "giving up on {} after {} attempts".format(url, max_attempts)
    )
57
def main():
    """Crawl listing pages 1-4 and refresh the database from every detail page.

    Detail-page links are collected from the listing pages first, then each
    one is handed to run() on a pool of 20 worker threads.
    """
    get_proxy()
    detail_paths = []
    for type_id in range(1, 5):
        page = getHtml(
            "http://www.zuidazy1.net/?m=vod-type-id-{}.html".format(type_id)
        )
        selector = etree.HTML(page)
        if selector is None:
            # Unparsable listing page: skip it instead of crashing the crawl.
            continue
        detail_paths += selector.xpath("//li/span/a/@href")
    # Create the thread pool
    pool = ThreadPool(20)
    for path in detail_paths:
        # apply_async never raises for a failure inside run(); without an
        # error_callback such failures would vanish with the unread result.
        pool.apply_async(
            run,
            args=("http://www.zuidazy1.net" + path,),
            error_callback=print,
        )
    pool.close()
    pool.join()
    print("结束")
74
# XPath template for the n-th entry of the detail page's info list.
_INFO_XPATH = "//div[normalize-space(@class)='vodinfobox']/ul/li[{}]/span/text()"


def _update_field(conn, cursor, row_id, column, value):
    """Store ``value`` in ``column`` of the `ysb` row ``row_id`` and commit."""
    # ``column`` only ever comes from the fixed names used in this module;
    # the scraped value and the id are bound as query parameters so quotes in
    # page text can neither break nor inject into the statement.
    cursor.execute(
        "UPDATE `ysb` SET `{}` = %s WHERE `ysb`.`id` = %s".format(column),
        (str(value), row_id),
    )
    print("更新" + str(row_id) + column)
    conn.commit()


def _refresh_row(conn, cursor, row, selector):
    """Fill every empty column of ``row`` from the parsed detail page.

    ``row`` holds (pf, dy, dq, yy, gkdz, xzdz, id, pctime) in SELECT order.
    """
    row_id = row[6]

    def first_text(xpath):
        # First matching text node as a plain str, or "" when nothing matches.
        return str(ifnull(selector.xpath(xpath)))

    if str(row[0]) in ("", "0", "0.0"):
        _update_field(
            conn, cursor, row_id, "pf",
            first_text("//div[normalize-space(@class)='vodh']/label/text()"),
        )

    # (column, index in row, position in the info list)
    for column, index, position in (("dy", 1, 2), ("dq", 2, 5), ("yy", 3, 6)):
        if str(row[index]) == "":
            _update_field(
                conn, cursor, row_id, column,
                first_text(_INFO_XPATH.format(position)),
            )

    # Link lists are stored as JSON; "[]" marks a list that is still empty.
    link_lists = (
        ("gkdz", 4, "//div[normalize-space(@id)='play_1']/ul/li/text()"),
        ("xzdz", 5, "//div[normalize-space(@id)='down_1']/ul/li/text()"),
    )
    for column, index, xpath in link_lists:
        if str(row[index]) == "[]":
            entries = selector.xpath(xpath)
            if len(entries) > 0:
                _update_field(
                    conn, cursor, row_id, column,
                    json.dumps(fenji(entries), ensure_ascii=False),
                )

    if str(row[7]) == "" or row[7] == '0':
        _update_field(
            conn, cursor, row_id, "pctime", first_text(_INFO_XPATH.format(8))
        )


def run(url):
    """Scrape one detail page and fill the empty columns of its `ysb` row.

    The row is looked up by the page title (column `pm`). Pages that cannot
    be parsed, that carry no title, or that match no row are skipped.
    Database errors are printed rather than raised, and the pooled
    connection is always handed back.

    Args:
        url: Absolute URL of the detail page.
    """
    # Download and parse first, so a pooled connection is only held while
    # the database is actually being used.
    selector = etree.HTML(getHtml(url))
    if selector is None:
        return
    pm = ifnull(selector.xpath("//div[normalize-space(@class)='vodh']/h2/text()"))
    if pm == "":
        # An empty title would match rows whose `pm` is empty and overwrite
        # them with data from an unrelated page.
        return

    conn = POOL.connection()
    try:
        cursor = conn.cursor()
        try:
            matched = cursor.execute(
                "SELECT pf,dy,dq,yy,gkdz,xzdz,id,pctime FROM `ysb` WHERE `pm` LIKE %s",
                (str(pm),),
            )
            if matched > 0:
                _refresh_row(conn, cursor, cursor.fetchall()[0], selector)
        except Exception as e:
            # Best effort per page: report the problem, keep the crawl going.
            print(e)
        finally:
            cursor.close()
    finally:
        # Returns the connection to the pool even when something above failed.
        conn.close()
142
def fenji(jilist):
    """Convert ``name$url`` strings into a list of name/url dicts.

    Args:
        jilist: Iterable of strings in which "$" separates name and URL.

    Returns:
        One ``{"name": ..., "url": ...}`` dict per entry, in input order.

    Raises:
        IndexError: If an entry contains no "$".
    """
    episodes = []
    for entry in jilist:
        parts = entry.split("$")
        # Only the first two fields are used; anything after a second "$"
        # is ignored.
        episodes.append(vars(Ji(parts[0], parts[1])))
    return episodes
148
def ifnull(list):
    """Return the first item of ``list``, or "" when it has no items."""
    # The parameter name shadows the builtin but is part of the call
    # interface, so it is left as is.
    if len(list) == 0:
        return ""
    return list[0]
152
# Start the crawl only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
0 commit comments