forked from dongfanger/pycnblog
-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathfind_good_articles.py
More file actions
116 lines (91 loc) · 3.48 KB
/
find_good_articles.py
File metadata and controls
116 lines (91 loc) · 3.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import os
import shutil
import threading
from fuzzywuzzy import fuzz
# Minimum fuzz.ratio score (exclusive) for two lines to count as similar.
similarity_threshold = 80


def check_similarity(text):
    """Count the pairs of lines in ``text`` that look like near-duplicates.

    ``text`` is split on newlines and every unordered pair of lines is scored
    with ``fuzz.ratio``. A pair is counted when its score is strictly greater
    than the module-level ``similarity_threshold``.

    Returns:
        The number of line pairs whose score exceeds the threshold.
    """
    rows = text.split('\n')
    # O(n^2) comparisons, so this is slow on long texts; the original author
    # notes that it nevertheless detects repeated content well in testing.
    return sum(
        1
        for position, first in enumerate(rows)
        for second in rows[position + 1:]
        if fuzz.ratio(first, second) > similarity_threshold
    )
def is_good_content(content):
    """Return True only if ``content`` contains every required marker.

    The markers are the fenced-code delimiter (meaning the article has code)
    and a fixed set of Chinese phrases. The "$$" marker (meaning the article
    has formulas) is currently disabled.
    """
    required_markers = (
        # "$$",
        "```",
        "背景介绍",
        "核心概念与联系",
        "核心算法原理",
        "代码实例",
        "实际应用场景",
        "工具和资源",
        "未来发展趋势与挑战",
    )
    return all(marker in content for marker in required_markers)
def process_file(file_path, target_good_directory, target_draft_directory):
    """Copy one article into the "good" or "draft" directory if it qualifies.

    The file is read as UTF-8, blank lines are dropped and the remaining
    lines are stripped. The cleaned text is then classified by size:

    * good:  at least 2500 characters and at least 100 non-blank lines;
    * draft: strictly between 2000 and 2500 characters and strictly between
      80 and 100 non-blank lines.

    In either case the text must also pass ``is_good_content``. A qualifying
    file is copied (original, uncleaned bytes) under its own base name and
    the destination path is printed. Anything else is left untouched.

    Args:
        file_path: Path of the article to inspect.
        target_good_directory: Existing directory that receives good articles.
        target_draft_directory: Existing directory that receives drafts.
    """
    # The articles contain Chinese text, so decode explicitly as UTF-8
    # instead of relying on the platform's locale encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        cleaned_lines = [line for line in stripped_lines if line]

    content = '\n'.join(cleaned_lines)
    length = len(content)
    line_count = len(cleaned_lines)

    # NOTE(review): the two size bands do not cover every large file. For
    # example, >= 2500 characters with fewer than 100 lines matches neither
    # band. This mirrors the original thresholds; confirm it is intended.
    if length >= 2500 and line_count >= 100:
        destination_root = target_good_directory
        label = "process_good_file:"
    elif 2000 < length < 2500 and 80 < line_count < 100:
        destination_root = target_draft_directory
        label = "process_draft_file:"
    else:
        return

    # Only run the keyword check once the size band has matched.
    if not is_good_content(content):
        return

    destination = os.path.join(destination_root, os.path.basename(file_path))
    shutil.copy(file_path, destination)
    print(label, destination)
def find_articles(date, base_directory='/home/me/tools/pycnblog'):
    """Classify every Markdown article stored for ``date``.

    Reads ``<base_directory>/articles/<date>``, creates
    ``<base_directory>/articles_good/<date>`` and
    ``<base_directory>/articles_draft/<date>`` if needed, and runs
    ``process_file`` on each ``.md`` file in its own thread.

    Args:
        date: Name of the per-day sub-directory (the caller passes yyyyMMdd).
        base_directory: Root that holds the ``articles*`` directories.
            Defaults to the original hard-coded location.

    Raises:
        OSError: If the source directory cannot be listed or a target
            directory cannot be created.
    """
    # Source and target directories.
    source_directory = f'{base_directory}/articles/{date}'
    target_good_directory = f'{base_directory}/articles_good/{date}'
    target_draft_directory = f'{base_directory}/articles_draft/{date}'

    # Create the target directories.
    os.makedirs(target_good_directory, exist_ok=True)
    os.makedirs(target_draft_directory, exist_ok=True)

    threads = []
    for file_name in os.listdir(source_directory):
        if not file_name.endswith('.md'):
            continue
        file_path = os.path.join(source_directory, file_name)
        thread = threading.Thread(
            target=process_file,
            args=(file_path, target_good_directory, target_draft_directory),
        )
        # Start immediately; joining here would serialize the work.
        thread.start()
        threads.append(thread)

    # Wait for all threads to finish. One thread is spawned per file, so
    # this assumes a day's directory holds a modest number of articles.
    for thread in threads:
        thread.join()
if __name__ == '__main__':
    import datetime

    # Take the current date as the starting point.
    today = datetime.datetime.today()

    # Dates for the last two days (offset 0 is today), formatted as yyyyMMdd.
    recent_days = [
        (today - datetime.timedelta(days=offset)).strftime('%Y%m%d')
        for offset in range(0, 2)
    ]

    # A failure on one day is reported and does not stop the next day.
    for day in recent_days:
        try:
            find_articles(day)
        except Exception as e:
            print(f"Error occurred while finding good: {e}")