-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathautomatic_update_dcard_comment.py
More file actions
123 lines (105 loc) · 4.58 KB
/
automatic_update_dcard_comment.py
File metadata and controls
123 lines (105 loc) · 4.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
import time
import re
from collections import OrderedDict
import mysql.connector
import datetime
# Date stamp used in the input/output file names, e.g. "2024_01_31".
today = datetime.date.today().isoformat().replace('-', '_')

# Load today's cleaned article dump; only the article ids are needed here.
file_name = 'clean_dcard_food' + today + '.json'
dcard_food = pd.read_json(file_name, encoding='utf-8')
assign_list = dcard_food['id']

# Parallel accumulators filled by the scraping loop:
# list1 holds the cleaned comment text, list2 the matching article id.
list1, list2 = [], []

# The timer starts after the input file has been loaded.
start = time.time()
print('The program starts...')

# Number of articles still to be processed (counted down while scraping).
count = len(assign_list)
article_count = 0
# All patterns are compiled once, outside the scraping loop, because they are
# identical for every comment.

# Direct links to images (jpg / gif / png).
RE_IMG_LINK = re.compile(r'http(s?):([/|.|\w|\s|-])*\.(?:jpg|gif|png)')

# A comment that consists of nothing but a URL or a bare domain name.
# The pattern is written as two adjacent raw strings: a backslash line
# continuation inside a raw string would put a literal newline into the
# pattern and split the "\." in front of the top-level domain.
RE_URL_ONLY = re.compile(
    r'^(http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/)?'
    r'[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/.*)?$')

# Text emoticons ("XD and so on").  The "^" inside the emoticons is escaped so
# it is matched literally instead of acting as a start-of-string anchor.
# NOTE(review): ".~" is kept as "any character followed by ~" -- confirm
# whether a literal dot was intended.
RE_EMOTICON = re.compile(
    r'.~|\^_\^| ̄\^ ̄|XDD|ಠ_ಠ|๑´ڡ`๑*|。・ω・。|˶᷄ ̫ ᷅˵|;´༎ຶД༎ຶ`')

# Placeholder text shown in place of a deleted comment.
DELETED_NOTICE = '已經刪除的內容就像 Dcard 一樣,錯過是無法再相見的!'

# Any remaining http(s)/ftp/file/www link.  The character classes only list
# upper-case letters, so IGNORECASE is required to match ordinary URLs.
RE_ANY_LINK = re.compile(
    r'(?:(?:https?|ftp|file):\/\/|www\.|ftp\.)(?:\([-A-Z0-9+&@#\/%=~_|$?!:,.]*\)|[-A-Z0-9+&@#\/%=~_|$?!:,.])*(?:\([-A-Z0-9+&@#\/%=~_|$?!:,.]*\)|[A-Z0-9+&@#\/%=~_|$])',
    flags=re.IGNORECASE)

# Bopomofo letters, tone marks and assorted punctuation.  Inside a character
# class "|" is a literal, so pipe characters are stripped as well.
RE_SYMBOLS = re.compile(
    '[ㄅ|ㄆ|ㄇ|ㄈ|ㄉ|ㄊ|ㄋ|ㄌ|ㄍ|ㄎ|ㄏ|ㄐ|ㄑ|ㄒ|ㄓ|ㄔ|ㄕ|ㄖ|ㄗ|ㄘ|ㄙ|ㄧ|ㄨ|ㄩ|ㄚ|ㄛ|ㄜ|ㄝ|ㄞ|ㄟ|ㄠ|ㄡ|ㄢ|ㄣ|ㄤ|ㄥ|ˇ|ˋ|ˊ|˙|!|?|,|.|/|$|@|%|︿|&|*|(|)|_|+|~|~]')

# Emoji and the U+2000-U+3300 symbol range.
RE_EMOJI = re.compile(
    '(\u00a9|\u00ae|[\u2000-\u3300]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff]|[\U00010000-\U0010ffff])'
    , flags=re.UNICODE)


def strip_emoji(text):
    """Return *text* with every match of RE_EMOJI removed."""
    return RE_EMOJI.sub(r'', text)


def clean_comment_text(item):
    """Return one raw comment string with the unwanted content removed.

    Steps, in order: image links, URL-only comments, text emoticons, the
    deleted-comment placeholder, any remaining links, Bopomofo/punctuation
    characters, and finally emoji.
    """
    item = RE_IMG_LINK.sub('', item)
    item = RE_URL_ONLY.sub('', item)
    item = RE_EMOTICON.sub('', item)
    # The placeholder and the links must be removed BEFORE the punctuation is
    # stripped: once ",", "!", "." and "/" are gone neither of them can match.
    item = item.replace(DELETED_NOTICE, '')
    item = RE_ANY_LINK.sub('', item)
    item = RE_SYMBOLS.sub('', item)
    return strip_emoji(item)


for i in assign_list:
    count = count - 1
    print(count, '...')
    # Pause between requests (presumably to avoid hammering the site).
    time.sleep(2)
    dcard_url = 'https://www.dcard.tw/f/food/p/' + str(i)
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(dcard_url, timeout=30)
    except requests.RequestException as err:
        # Skip the article entirely so that no empty comment is recorded
        # for it further down the pipeline.
        print(i, ':', 'Request failed, skipped.', err)
        continue
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(response.text, 'html.parser')
    comment = soup.find_all(class_='CommentEntry_content_1ATrw1')
    # Delete the duplicates while keeping the order in which they appear on
    # the page (a plain set() would make the output order vary between runs).
    all_comment = list(dict.fromkeys(j.text for j in comment))
    # Clean every comment and join them into one long string per article.
    clean_comment = ''.join(clean_comment_text(item) for item in all_comment)
    list1.append(clean_comment)
    list2.append(i)
# Assemble the two parallel lists into a table with columns ID, Comment
# (in that order) and dump it as a JSON array of records.
data_parse = OrderedDict()
data_parse['ID'] = list2
data_parse['Comment'] = list1
dcard_food_df = pd.DataFrame(data_parse)

output_name = 'dcard_food_comment{}.json'.format(today)
with open(output_name, mode='w', encoding='utf-8') as json_out:
    # force_ascii=False keeps the non-ASCII text readable in the file.
    dcard_food_df.to_json(json_out, orient='records', force_ascii=False)
##############################################
# Store today's comments in MySQL: rows whose id already exists in table
# test02 are updated, all others are inserted.
#
# NOTE(review): the credentials are hard-coded in the source file; they
# should be moved to configuration / environment variables.
cnx = mysql.connector.connect(user='ray', password='Taiwan#1',
                              host='127.0.0.1',
                              database='dcad_db')
try:
    cursor = cnx.cursor()
    try:
        query = ("SELECT id FROM test02")
        cursor.execute(query)
        # A set gives O(1) membership tests in the loop below.
        existing_ids = {row[0] for row in cursor}

        update_article = "UPDATE test02 SET comment = %(comment)s WHERE id = %(id)s"
        add_article = ("INSERT INTO test02"
                       "(id, comment)"
                       "VALUES (%(id)s,%(comment)s)")

        dcard = pd.read_json('dcard_food_comment{}.json'.format(today), encoding='utf-8')
        # itertuples() yields nothing for an empty file, so no column lookup
        # is attempted when there are no records.
        for i, row in enumerate(dcard.itertuples(index=False)):
            content_list = {'comment': str(row.Comment), 'id': int(row.ID)}
            if content_list['id'] in existing_ids:
                # The article is already stored: refresh its comment text.
                statement, message = update_article, 'Updated the database.'
            else:
                # New article: insert it, and remember the id so a duplicate
                # later in the same file becomes an update, not a second insert.
                statement, message = add_article, 'Inserted into the database.'
                existing_ids.add(content_list['id'])
            cursor.execute(statement, content_list)
            # Commit per row so finished rows survive a later failure.
            cnx.commit()
            print(i, ":", message)
    finally:
        cursor.close()
finally:
    # Always release the connection, even when a statement raised.
    cnx.close()
##############################################
# Report the total running time.
end = time.time()
# Round the elapsed time to whole seconds first, then split it.  Rounding the
# minute count itself would round up (90 s became "2 m 30 s"), and rounding
# the remainder could print "60 s".
minute, second = divmod(round(end - start), 60)
print('Finished')
print('Total time:', minute, 'm', second, 's')