11import requests
2- from bs4 import BeautifulSoup
2+ import xml .etree .ElementTree as ET
3+ import html
34import yaml
4- import re
5- from datetime import datetime
6- import time
75
86
# Main function
def main():
    """Download Zhicheng Dou's DBLP profile page and save it locally.

    The page is written to ``zhicheng_dou.html`` in the current working
    directory, encoded as UTF-8.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # DBLP page URL for Zhicheng Dou.
    dblp_url = "https://dblp.org/pid/18/5740.html"

    # Without a timeout a stalled connection would block forever; without the
    # status check an error page would be saved as if it were the profile.
    response = requests.get(dblp_url, timeout=30)
    response.raise_for_status()

    # Fetch the HTML page and save it to a file.
    with open("zhicheng_dou.html", "w", encoding="utf-8") as f:
        f.write(response.text)
# Seconds to wait for DBLP before giving up; requests has no default timeout.
_REQUEST_TIMEOUT_SECONDS = 30

_MONTH_NAMES = (
    "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december",
)

# Lower-cased full names and three-letter abbreviations -> two-digit month.
_MONTH_NUMBERS = {}
for _index, _name in enumerate(_MONTH_NAMES, start=1):
    _MONTH_NUMBERS[_name] = f"{_index:02d}"
    _MONTH_NUMBERS[_name[:3]] = f"{_index:02d}"
del _index, _name


def _element_text(element, default=""):
    """Return the full text of *element*, including text inside child tags.

    DBLP titles may contain inline markup (``<i>``, ``<sub>``, ``<sup>``);
    ``element.text`` alone stops at the first child tag and is ``None`` when
    the element starts with one, so all text nodes are joined instead.
    *default* is returned when the element is missing or has no text.
    """
    if element is None:
        return default
    text = "".join(element.itertext()).strip()
    return text or default


def _normalize_month(raw_month):
    """Convert a month name, abbreviation or number to a two-digit string.

    Anything unrecognised or outside 1-12 falls back to ``"01"``.
    """
    cleaned = raw_month.strip().rstrip(".").lower()
    if cleaned in _MONTH_NUMBERS:
        return _MONTH_NUMBERS[cleaned]
    if cleaned.isdecimal() and 1 <= int(cleaned) <= 12:
        return f"{int(cleaned):02d}"
    return "01"  # default fallback


def _publication_to_entry(pub):
    """Build the ``sources.yaml`` entry for one DBLP publication element.

    *pub* is a child of an ``<r>`` element (``<article>``,
    ``<inproceedings>``, ...). Missing fields get placeholder values so a
    sparse record never aborts the export.
    """
    title_text = _element_text(pub.find("title"), "Unknown Title")
    # Remove the trailing period, if present.
    if title_text.endswith("."):
        title_text = title_text[:-1]

    authors = [author.text for author in pub.findall("author") if author.text]

    # Publisher / venue, in priority order: journal -> proceedings -> school
    # (the latter for theses).
    publisher = "Unknown Publisher"
    for tag in ("journal", "booktitle", "school"):
        venue = _element_text(pub.find(tag))
        if venue:
            publisher = venue
            break

    # Date: only year and (rarely) month are available, so the day is fixed.
    year_text = _element_text(pub.find("year"), "0000")
    month_num = _normalize_month(_element_text(pub.find("month"), "01"))
    date_str = f"{year_text}-{month_num}-01"

    # Link: prefer 'ee' (electronic edition), usually a DOI or URL.
    link = _element_text(pub.find("ee"))
    if not link:
        # Fall back to the DBLP record page, which lives under /rec/<key>.
        key = pub.get("key")
        link = f"https://dblp.org/rec/{key}" if key else "https://dblp.org/"

    # ID: use the "doi:..." form when the link is a DOI URL, otherwise the
    # link itself, e.g. id: doi:10.1109/TKDE.2023.3291006
    paper_id = link
    if "doi.org/" in link:
        doi_part = link.rpartition("doi.org/")[2]
        paper_id = f"doi:{doi_part}"

    # Keys are inserted in the order they should appear in the YAML output.
    return {
        'id': paper_id,
        'title': title_text,
        'authors': authors,
        'publisher': publisher,
        'date': date_str,
        'link': link,
        'type': 'paper',
        'plugin': 'sources.py',
        'file': 'sources.yaml',
    }


def extract_dblp_to_yaml():
    """Fetch Zhicheng Dou's DBLP record list and write it to ``sources.yaml``.

    Downloads the person XML from DBLP, converts every publication found
    under the ``<r>`` elements into a dictionary, sorts the entries newest
    first and dumps them as YAML into ``sources.yaml`` in the current
    working directory.

    Returns:
        None. Network and XML parsing failures are reported on stdout and the
        function returns without writing the output file.
    """
    url = "https://dblp.org/pid/18/5740.xml"
    print(f"Fetching data from {url}...")
    try:
        response = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching URL: {e}")
        return

    try:
        root = ET.fromstring(response.content)
    except ET.ParseError as e:
        print(f"Error parsing XML: {e}")
        return

    # XML structure: <dblpperson><r><article> or <inproceedings> ...
    papers = [
        _publication_to_entry(pub)
        for record in root.findall('./r')
        for pub in record
    ]

    # Sort by date, newest first (ISO-like strings sort chronologically).
    papers.sort(key=lambda entry: entry['date'], reverse=True)

    output_path = "sources.yaml"
    with open(output_path, 'w', encoding='utf-8') as f:
        # allow_unicode=True keeps non-ASCII characters readable;
        # sort_keys=False preserves the field order chosen when each entry
        # was built.
        yaml.dump(
            papers,
            f,
            Dumper=yaml.SafeDumper,
            allow_unicode=True,
            sort_keys=False,
            default_flow_style=False,
        )

    print(f"Successfully extracted {len(papers)} papers to {output_path}")
18140
# Run the DBLP export only when this file is executed as a script.
if __name__ == "__main__":
    extract_dblp_to_yaml()
0 commit comments