-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathmain.py
More file actions
128 lines (107 loc) · 4.66 KB
/
main.py
File metadata and controls
128 lines (107 loc) · 4.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import time
import json
import pandas as pd
from selenium import webdriver
from items import Topic
# File name of a stop-word list (停用词库 = "stop-word library").
# It is not referenced anywhere else in the visible part of this script.
StopWords = '停用词库.txt'

# Two separate Chrome sessions: `driver` walks the topic list, while
# `driver_user` is used to open individual user profile pages.
driver, driver_user = webdriver.Chrome(), webdriver.Chrome()

# Open a page on the forum in each session, then drop whatever cookies the
# fresh sessions received -- presumably so the saved login cookies can be
# installed for this domain afterwards (TODO confirm).
driver.get("https://bbs.nga.cn/thread.php?fid=-7")
driver_user.get('https://bbs.nga.cn/nuke.php?func=ucp&uid=36587915')
driver.delete_all_cookies()
driver_user.delete_all_cookies()
def is_ElementExist(ddriver, xpath):
    """Report whether a lookup of ``xpath`` on ``ddriver`` succeeds.

    Args:
        ddriver: Any object exposing ``find_element_by_xpath`` (this script
            passes both WebDriver sessions and individual row elements).
        xpath: XPath expression handed to ``find_element_by_xpath``.

    Returns:
        True when the lookup returns without raising, False when it raises
        any ``Exception``.

    NOTE(review): because every ``Exception`` maps to False, a driver that
    lacks ``find_element_by_xpath`` altogether (AttributeError) is also
    reported as "element not present".
    """
    try:
        ddriver.find_element_by_xpath(xpath)
    except Exception:
        # Deliberately broad: any lookup failure counts as "not present".
        # KeyboardInterrupt / SystemExit are not Exception subclasses, so a
        # user abort is no longer swallowed here.
        return False
    return True
def is_number(s):
    """Return True if ``s`` can be interpreted as a number.

    Two checks are tried in order:

    1. ``float(s)`` -- accepts anything ``float`` accepts (e.g. ``"12"``,
       ``"-1.5"``, ``"1e5"``).
    2. ``unicodedata.numeric(s)`` -- accepts a single Unicode character that
       has a numeric value (e.g. ``"四"``).

    Args:
        s: Value to test; normally a string.

    Returns:
        True if either check succeeds, otherwise False. Values that neither
        check can handle (``None``, lists, the empty string, ...) yield False
        rather than raising.
    """
    import unicodedata

    try:
        float(s)
    except (TypeError, ValueError):
        # TypeError covers non-string / non-numeric inputs such as None, so
        # both checks treat an unsupported type the same way.
        pass
    else:
        return True

    try:
        unicodedata.numeric(s)
    except (TypeError, ValueError):
        return False
    return True
def now_time():
    """Return the current local time as a ``YYYY-MM-DD-HH-MM-SS`` string."""
    local_now = time.localtime()
    stamp = time.strftime("%Y-%m-%d-%H-%M-%S", local_now)
    return stamp
# NOTE(review): this script relies on Selenium's find_element_by_* /
# find_elements_by_* helpers, which were removed in Selenium 4.3; migrate to
# find_element(By.XPATH, ...) etc. when upgrading the dependency.


def _lookup_poster(poster_id):
    """Read the poster's profile fields from their user-centre page.

    Navigates ``driver_user`` to the profile page of ``poster_id``. The user
    group and registration date lookups raise if their elements are missing;
    prestige and post count fall back to ``''`` when their elements are absent.

    Args:
        poster_id: User id as a string, appended to the profile URL.

    Returns:
        Tuple ``(history_re_num, prestige, group, reg_date)`` of strings.
    """
    driver_user.get('https://bbs.nga.cn/nuke.php?func=ucp&uid=' + poster_id)
    group = driver_user.find_element_by_xpath(
        '//label[contains(text(), "用户组")]/../span/span').text
    reg_date = driver_user.find_element_by_xpath(
        '//label[contains(text(), "注册日期")]/../span/span').text

    prestige = ''
    prestige_xpath = '//label[contains(text(), "威望")]/../span'
    if is_ElementExist(driver_user, prestige_xpath):
        # [2:] drops the first two characters of the field text --
        # presumably a separator before the value (TODO confirm).
        prestige = driver_user.find_element_by_xpath(prestige_xpath).text[2:]

    history_re_num = ''
    post_count_xpath = '//label[contains(text(), "发帖数")]/../span'
    if is_ElementExist(driver_user, post_count_xpath):
        history_re_num = driver_user.find_element_by_xpath(post_count_xpath).text[2:]

    return history_re_num, prestige, group, reg_date


def _parse_topic_row(row):
    """Build the record for one topic row of the thread list.

    Args:
        row: Row element taken from the topic list page.

    Returns:
        A dict describing the topic and its poster, or ``None`` when the row
        title is the '帖子发布或回复时间超过限制' placeholder.

    Raises:
        Whatever the underlying element lookups raise when a required
        element is missing; the caller treats that as "skip this row".
    """
    replies = row.find_element_by_class_name('replies')
    link = str(replies.get_attribute('href'))
    # Drops the first 32 characters of the href, which is the length of
    # 'https://bbs.nga.cn/read.php?tid=' -- presumably leaving the thread id.
    tid = link[32:]
    title = row.find_element_by_class_name('topic').text
    if title == '帖子发布或回复时间超过限制':
        return None

    author = row.find_element_by_class_name('author')
    # [5:] drops the first five characters of the author's title attribute --
    # presumably a fixed label in front of the user id (TODO confirm).
    poster_id = str(author.get_attribute('title'))[5:]
    post_time = str(row.find_element_by_class_name('silver.postdate').get_attribute('title'))
    poster_name = author.text
    re_num = replies.text

    # Default partition name, used when the row carries no partition link.
    partition = '水区'
    partition_xpath = './td[@class="c2"]/span[@class="titleadd2"]/a[@class="silver"]'
    if is_ElementExist(row, partition_xpath):
        partition = row.find_element_by_xpath(partition_xpath).text

    sampling_time = now_time()

    # Look up the poster's profile; only possible when the id is numeric.
    user_history_re_num = user_prestige = user_group = user_reg_date = ''
    if is_number(poster_id):
        (user_history_re_num, user_prestige,
         user_group, user_reg_date) = _lookup_poster(poster_id)

    # Assemble the record.
    return {'tid': tid,
            'title': title,
            'reply_num': re_num,
            'partition': partition,
            'post_time': post_time,
            'poster_id': poster_id,
            'poster_name': poster_name,
            'poster_history_re_num': user_history_re_num,
            'poster_prestige': user_prestige,
            'poster_group': user_group,
            'poster_reg_date': user_reg_date,
            'sampling_time': sampling_time}


try:
    # Install the saved cookies into both browser sessions.
    with open('cookies.txt', 'r') as cookief:
        cookieslist = json.load(cookief)
    for cookie in cookieslist:
        driver.add_cookie(cookie)
        driver_user.add_cookie(cookie)
    print('cookies加载完成')
    driver.refresh()

    content = []
    time.sleep(1)

    skip_xpath = '//a[@title="继续访问艾泽拉斯国家地理"]'
    next_page_xpath = '//a[@title="加载下一页"]'

    # At most five passes; a pass spent clicking through the interstitial
    # "continue" link counts as one of them.
    for _ in range(5):
        if is_ElementExist(driver, skip_xpath):
            time.sleep(3)
            skip_link = driver.find_element_by_xpath(skip_xpath)
            skip_link.click()
            continue

        row_list = driver.find_elements_by_xpath('//*[@class="row1 topicrow"]')
        row_list.extend(driver.find_elements_by_xpath('//*[@class="row2 topicrow"]'))
        for row in row_list:
            try:
                item = _parse_topic_row(row)
                if item is None:
                    continue
                print(item)
                content.append(item)
            except Exception as exc:
                # Best effort: a row that cannot be parsed is skipped, but
                # the failure is reported instead of being silently dropped.
                print('row skipped:', type(exc).__name__)
                continue

        if not is_ElementExist(driver, next_page_xpath):
            # No further page: stop here rather than raising and losing the
            # rows collected so far.
            break
        continue_link = driver.find_element_by_xpath(next_page_xpath)
        continue_link.click()
        driver.refresh()

    data = pd.DataFrame(content)
    data.to_csv("ngaData" + str(now_time()) + ".csv", index=False, sep=',')
    time.sleep(5)
finally:
    # Always close both browsers, even when scraping failed part-way.
    try:
        driver.quit()
    finally:
        driver_user.quit()