import datetime
import glob
import json
import os
import sys
import time

import yaml

# Each day's article is written to its own directory named after today's date (YYYY-MM-DD).
path_to = f'src/content/blog/{datetime.datetime.now().strftime("%Y-%m-%d")}'

# The existence of today's directory is used as the "already ran" marker.
# NOTE(review): the directory is created before any content is generated, so a
# run that fails later still blocks retries for the rest of the day — confirm
# that this is intended.
if os.path.exists(path_to):
    print("Article already generated today.")
    # sys.exit instead of the builtin exit(), which is only injected by the
    # `site` module and is not guaranteed to exist.
    sys.exit(0)

os.makedirs(path_to, exist_ok=True)
print(f"Created directory {path_to}")

start = time.time()
print("Connecting to LLM API ...")
# The OpenAI client is pointed at the DeepSeek endpoint; the key comes from DS_APIKEY.
deepseek = OpenAI(base_url="https://api.deepseek.com", api_key=os.environ.get("DS_APIKEY"))
print(f"Initialized LLM API. ({time.time() - start:.1f}s)")

def generate(context, provider, model):
    """Run one chat completion and return the reply text, stripped.

    Args:
        context: List of chat messages (dicts with "role" and "content").
        provider: Client object exposing ``chat.completions.create``.
        model: Model name forwarded to the API.

    Returns:
        The content of the first choice with surrounding whitespace removed.

    Raises:
        ValueError: If the first choice carries no text content.
    """
    completion = provider.chat.completions.create(
        model=model,
        messages=context,
    )
    content = completion.choices[0].message.content
    if content is None:
        # Without this guard the failure surfaces as an opaque AttributeError on None.
        raise ValueError(f"Model {model!r} returned no text content")
    return content.strip()

def scrape_website(url, css_selector, timeout=10):
    """Fetch a page and return the elements matching a CSS selector.

    Args:
        url: Address of the page to download.
        css_selector: Selector passed to BeautifulSoup's ``select``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The list of matching elements, or an empty list when the request
        fails or the response status is not 200.
    """
    try:
        # requests applies no timeout by default; without one a stalled
        # server would hang the whole run.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return []
    if response.status_code != 200:
        return []
    soup = BeautifulSoup(response.content, "html.parser")
    return soup.select(css_selector)

def get_existing_blog_posts():
    """Collect the title and description of every existing blog post.

    Reads each ``src/content/blog/*/index.md``, parses its YAML front matter
    and keeps the ``title`` and ``description`` fields (empty string when a
    field is absent). Files that do not start with ``---`` are skipped
    silently; unreadable files and malformed front matter are reported on
    stdout and skipped.

    Returns:
        A list of ``{"title": ..., "description": ...}`` dicts.
    """
    blog_posts = []
    # Sorted so the result does not depend on filesystem enumeration order.
    for path in sorted(glob.glob("src/content/blog/*/index.md")):
        try:
            with open(path, "r", encoding="utf-8") as f:
                content = f.read()
            if not content.startswith("---"):
                continue
            # Expected layout: "" / front matter / markdown body.
            parts = content.split("---", 2)
            if len(parts) < 3:
                print(f"Error reading {path}: unterminated front matter")
                continue
            metadata = yaml.safe_load(parts[1])
            if not isinstance(metadata, dict):
                # Empty front matter parses to None; scalars/lists have no fields.
                print(f"Error reading {path}: front matter is not a mapping")
                continue
            blog_posts.append({
                'title': metadata.get('title', ''),
                'description': metadata.get('description', ''),
            })
        except (OSError, UnicodeDecodeError, yaml.YAMLError) as e:
            print(f"Error reading {path}: {e}")

    return blog_posts

# Flatten the existing posts into one text block that the prompts embed so the
# model can steer away from topics that were already covered.
existing_posts = get_existing_blog_posts()
existing_posts_text = "\n".join(
    f"标题: {post['title']}\n描述: {post['description']}" for post in existing_posts
)
print(f"Loaded {len(existing_posts)} existing blog posts.")

topics = [topic.get_text(strip=True) for topic in scrape_website("https://news.ycombinator.com/", ".titleline")]
if not topics:
    # scrape_website returns [] on failure; random.randint(5, 0) would otherwise
    # fail with an unhelpful "empty range" ValueError.
    sys.exit("No topics scraped from Hacker News; aborting.")
# Pick a random subset of at least 5 headlines (or all of them when fewer were
# scraped). random.sample draws without replacement, so no headline repeats.
sample_size = random.randint(min(5, len(topics)), len(topics))
topics_text = "\n".join(random.sample(topics, k=sample_size))
print(f"Scraped {len(topics)} topics from Hacker News.")

def extract_topic(topics):
    """Ask the chat model for a single blog topic based on the given headlines.

    The prompt also embeds the module-level ``existing_posts_text`` and asks the
    model to avoid topics similar to existing posts.

    Args:
        topics: Text inserted into the prompt as the list of Hacker News
            headlines (presumably the newline-joined titles — confirm at the
            call site).

    Returns:
        The model's reply as returned by ``generate``, using "deepseek-chat".
    """
    # deepseek and existing_posts_text are only read, so no ``global`` is needed.
    return generate([
        {"role": "system", "content": "你在为一篇技术博客确定一个主题。直接用中文输出主题。"},
        {"role": "user", "content": f"阅读以下是HackerNews的热门文章,然后写一个可以用于技术博客的主题。这个主题应当是一个通用、普通的技术,不能是一个事件或其它东西。\n\n{topics}\n\n以下是已有的博客文章,请避免选择相似的主题: \n\n{existing_posts_text}\n\n只需要一个主题,直接输出。"},
    ], deepseek, "deepseek-chat")

def outline(topic):
    """Ask the reasoning model for a detailed article outline on a topic.

    Args:
        topic: Topic text interpolated into the prompt.

    Returns:
        The model's reply as returned by ``generate``, using "deepseek-reasoner".
    """
    # deepseek is only read, so no ``global`` declaration is needed.
    return generate([
        {"role": "user", "content": f"我要写一篇关于「{topic}」的博客文章。帮我列一个详细的文章提纲。"},
    ], deepseek, "deepseek-reasoner")

def write_from_outline(outline):
    """Ask the reasoning model to write the full article from an outline.

    The system message sets Chinese typesetting rules; the user message carries
    the outline, the module-level ``existing_posts_text`` (so the article does
    not repeat existing posts) and the Markdown/LaTeX formatting requirements.

    Args:
        outline: Outline text placed at the start of the prompt. Inside this
            function the name shadows the module-level ``outline`` function,
            which is not called here.

    Returns:
        The model's reply as returned by ``generate``, using "deepseek-reasoner".
    """
    # deepseek and existing_posts_text are only read, so no ``global`` is needed.
    # The LaTeX example uses "\\infty" so the prompt contains a literal backslash.
    return generate([
        {"role": "system", "content": "你是一位专业技术博客作者。在写作时请遵循以下中文排版规范:1) 中文与英文、数字之间需要有空格;2) 中文标点与英文、数字之间不加空格;3) 使用全角中文标点;4) 专有名词大小写正确;5) 英文、数字使用半角字符;6) 使用直角引号「」。"},
        {"role": "user", "content": f"{outline}\n\n根据这个提纲中关于技术知识的部分,写出一篇技术博客文章。文章中避免出现图片,避免使用列表。每一段出现的代码都进行较为详细的解读。在讲述内容时尽量使用段落的语言,语言风格可以略偏专业,但保持清晰。\n\n以下是已有的博客文章,请确保你的内容与它们不重复:\n\n{existing_posts_text}\n\n使用Markdown(要求符合Common Markdown规范)输出,使用LaTeX公式(注意:数学的开闭定界符前后不能有字母或数字字符。像x$a + b = c$或$a + b = c$1将无法渲染为数学公式(所有$会被渲染为$);但x $\\infty$ 1和($\\infty$)会正常渲染),标题尽量只用一级标题 `#` 和二级标题 `##`,不要用分割线。直接输出正文。"},
    ], deepseek, "deepseek-reasoner")

def summary(article):
    """Ask the chat model for a short (about 15 characters) blurb for an article.

    Args:
        article: Full article text appended to the prompt.

    Returns:
        The model's reply as returned by ``generate``, using "deepseek-chat".
    """
    # deepseek is only read, so no ``global`` declaration is needed.
    return generate([
        {"role": "system", "content": "你是一个技术博客简介写作者,简介不一定需要涵盖文章的全部内容,能起到一定的提示作用即可。直接输出简介。请遵循中文排版规范,确保中英文之间有空格,使用正确的标点符号 。"},
        {"role": "user", "content": f"给这篇文章写一个15字的简短介绍:\n\n{article}"},
    ], deepseek, "deepseek-chat")

# Restart the timer for the topic-generation step.
start = time.time()
print("Generating topic ...")
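# NOTE: diff hunk boundary (@@ -90,30 +120,30 @@) — the statements between this point and the author selection that follows are not shown in this excerpt.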
author = random.choice(["杨其臻", "杨子凡", "叶家炜", "黄京"])

# The first level-one heading of the generated article becomes the post title,
# and the front matter is emitted once, at that point.
for line in lines:
    if not line.startswith("# "):
        continue
    title = line[1:].strip()
    print(f"Detected title: {title}")

    # json.dumps emits a double-quoted, fully escaped string, which is also a
    # valid YAML double-quoted scalar. Plain interpolation broke the front
    # matter whenever the text contained a double quote or a backslash.
    metadata = "\n".join([
        "---",
        f"title: {json.dumps(title, ensure_ascii=False)}",
        f"author: {json.dumps(author, ensure_ascii=False)}",
        f'date: "{datetime.datetime.now().strftime("%b %d, %Y")}"',
        f"description: {json.dumps(summary_result, ensure_ascii=False)}",
        "latex: true",
        "pdf: true",
        "---",
    ]) + "\n"
    print(f"Injecting metadata:\n{metadata.strip()}")

    markdown_file += metadata
    break
else:
    # No heading means no front matter was written; make that visible.
    print("Warning: no level-one heading found; front matter not injected.")

# Lines starting with "---" are dropped from the body — presumably so that a
# horizontal rule cannot be mistaken for a front-matter fence (the writing
# prompt also asks the model not to emit rules).
markdown_file += "".join(line + "\n" for line in lines if not line.startswith("---"))

with open(f"{path_to}/index.md", "w", encoding="utf-8") as f:
    f.write(markdown_file)

print(f"Markdown file generated at {path_to}/index.md")
# (scrape residue: "0 commit comments" page footer, not part of the script)