66import json
77import os
88import re
9+ import time
910
1011import requests
1112from bs4 import BeautifulSoup as bs
1213
14+ from .GetUrls import PCUrls
15+
1316base_columns = ['url' , 'title' , 'date' , 'headlines' , 'copyright' ]
1417A_columns = ['read_num' , 'old_like_num' , 'like_num' ]
1518B_columns = ['comments_num' , 'comments_content' , 'comments_like_num' ]
@@ -127,16 +130,54 @@ def copyright_num(copyright_stat):
def copyright_num_detailed(copyright_stat):
    """Map a raw ``copyright_stat`` code to a coarse category number.

    Returns:
        1  -- marked as original content (原创)
        0  -- recommended account (荐号)
        2  -- repost (转发)
        3  -- source is not a WeChat article
        4  -- descriptive account name ("xxx's official account")
        5  -- one of a small set of other known codes (14, 12, 201)
        None -- any unrecognized code
    """
    # Dispatch table replaces the if/elif chain; keys are disjoint from
    # the "other known codes" tuple, so lookup order is irrelevant.
    category_by_stat = {
        11: 1,    # marked as original
        100: 0,   # recommended account
        101: 2,   # repost
        0: 3,     # source is not a WeChat article
        1: 4,     # descriptive account name
    }
    if copyright_stat in category_by_stat:
        return category_by_stat[copyright_stat]
    if copyright_stat in (14, 12, 201):
        return 5
    return None
146+
147+
def read_nickname(fname):
    """Read previously-saved records from *fname*.

    Each line of the file is expected to hold comma-separated fields
    in the form ``"a, b, c"``.

    Args:
        fname: path to a UTF-8 text file.

    Returns:
        A list with one list of string fields per line.

    Fix: strip the trailing newline before splitting — ``readlines()``
    keeps ``'\\n'``, so the original left a newline suffix on the last
    field of every line.
    """
    with open(fname, 'r', encoding='utf-8') as f:
        return [line.rstrip('\n').split(', ') for line in f]
153+
154+
def get_history_urls(biz,
                     uin,
                     key,
                     lst=None,
                     start_timestamp=0,
                     count=10,
                     endcount=99999):
    """Collect batches of historical article urls for a public account.

    Pages through ``PCUrls.get_urls`` starting at offset *count*,
    appending each non-empty batch to *lst*, until an empty batch is
    returned, the newest-fetched article is older than
    *start_timestamp*, or the offset reaches *endcount*.

    Args:
        biz, uin, key: credentials forwarded to ``PCUrls``.
        lst: optional list to accumulate batches into (a fresh list is
            created when omitted).
        start_timestamp: stop once an article's datetime is <= this.
        count: starting offset; advanced by 10 per page.
        endcount: hard upper bound on the offset.

    Returns:
        The accumulated list — in every case, including Ctrl-C and
        errors, matching the original behaviour where
        ``finally: return lst`` swallowed all exceptions.
    """
    # Fix: avoid the shared mutable default argument (was ``lst=[]``).
    if lst is None:
        lst = []
    t = PCUrls(biz=biz, uin=uin, cookie='')
    try:
        while True:
            res = t.get_urls(key, offset=count)
            if res == []:
                break
            count += 10
            print(count)
            lst.append(res)
            dt = res[-1]["comm_msg_info"]["datetime"]
            if dt <= start_timestamp or count >= endcount:
                break
            # Pause between paged requests to avoid hammering the server.
            time.sleep(5)
    except KeyboardInterrupt:
        print('程序手动中断')
    except Exception as e:
        # Best-effort: report the failure but still hand back what we got.
        # (The original's ``assert 1 == 2`` was dead code — its effect was
        # always cancelled by ``finally: return lst``.)
        print(e)
        print("获取文章链接失败。。。退出程序")
    return lst
0 commit comments