Skip to content

Commit cf8a30b

Browse files
committed
add utils history url
1 parent 05c0d4b commit cf8a30b

File tree

2 files changed

+48
-7
lines changed

2 files changed

+48
-7
lines changed

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
setuptools.setup(
88
name="wechatarticles",
9-
version="0.5.5",
9+
version="0.5.6",
1010
author="wnma3mz",
1111
author_email="wnma3mz@gmail.com",
1212
description="wechat articles scrapy",

wechatarticles/utils.py

Lines changed: 47 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,13 @@
66
import json
77
import os
88
import re
9+
import time
910

1011
import requests
1112
from bs4 import BeautifulSoup as bs
1213

14+
from .GetUrls import PCUrls
15+
1316
base_columns = ['url', 'title', 'date', 'headlines', 'copyright']
1417
A_columns = ['read_num', 'old_like_num', 'like_num']
1518
B_columns = ['comments_num', 'comments_content', 'comments_like_num']
@@ -127,16 +130,54 @@ def copyright_num(copyright_stat):
127130
def copyright_num_detailed(copyright_stat):
    """Translate a raw ``copyright_stat`` value into a category code.

    The candidates are compared with ``==`` in a fixed order and the code
    of the first match is returned; ``None`` is returned when nothing
    matches.
    """
    # (raw value, category code) pairs, checked top to bottom.
    # The trailing comments are translations of the original notes.
    stat_to_code = (
        (11, 1),   # marked as original content
        (100, 0),  # account recommendation
        (101, 2),  # repost
        (0, 3),    # source is not a WeChat article
        (1, 4),    # adjective ("xxx's official account")
        (14, 5),
        (12, 5),
        (201, 5),
    )
    for raw_value, code in stat_to_code:
        if copyright_stat == raw_value:
            return code
    return None
146+
147+
148+
def read_nickname(fname):
    """Load a UTF-8 text file and split every line on ``', '``.

    Args:
        fname: Path of the file to read.

    Returns:
        A list with one entry per line of the file; each entry is the
        list of fields produced by ``line.split(', ')``. Line endings are
        not stripped, so the last field of a line keeps its trailing
        newline.
    """
    rows = []
    with open(fname, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            rows.append(raw_line.split(', '))
    return rows
153+
154+
155+
def get_history_urls(biz,
                     uin,
                     key,
                     lst=None,
                     start_timestamp=0,
                     count=10,
                     endcount=99999):
    """Collect batches of history entries page by page via ``PCUrls``.

    Repeatedly calls ``PCUrls.get_urls(key, offset=count)``, appending each
    non-empty batch to ``lst`` and advancing ``count`` by 10, until one of
    the stop conditions below is met. Waits 5 seconds between requests.

    Args:
        biz: Forwarded to ``PCUrls(biz=...)``.
        uin: Forwarded to ``PCUrls(uin=...)``.
        key: Forwarded to ``get_urls`` on every request.
        lst: Optional list to append batches to (mutated in place). A new
            list is created when omitted, so separate calls never share
            results.
        start_timestamp: Stop once the last entry of a batch has a
            ``comm_msg_info.datetime`` less than or equal to this value.
        count: Offset used for the first request.
        endcount: Stop once the advanced offset reaches this value.

    Returns:
        The list of collected batches (one element per request, each being
        whatever ``get_urls`` returned). On Ctrl-C or on any error while
        fetching, the batches gathered so far are returned after a message
        is printed.
    """
    # A literal [] default would be shared by every call; create it here.
    if lst is None:
        lst = []
    t = PCUrls(biz=biz, uin=uin, cookie='')
    try:
        while True:
            res = t.get_urls(key, offset=count)
            if res == []:
                break
            count += 10
            print(count)
            lst.append(res)
            dt = res[-1]["comm_msg_info"]["datetime"]
            if dt <= start_timestamp or count >= endcount:
                break
            # Throttle consecutive requests.
            time.sleep(5)
    except KeyboardInterrupt:
        print('程序手动中断')
    except Exception as e:
        # Best effort: report the failure and hand back what was collected.
        print(e)
        print("获取文章链接失败。。。退出程序")
    return lst

0 commit comments

Comments
 (0)