Skip to content

Commit d8fb671

Browse files
jjtan
authored and committed
feat: update citation
1 parent 978c954 commit d8fb671

File tree

2 files changed

+140
-16
lines changed

2 files changed

+140
-16
lines changed

_data/sources.yaml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
# DO NOT EDIT, GENERATED AUTOMATICALLY
2+
13
- id: doi:10.1145/3759453
24
title: A Survey of Conversational Search
35
authors:
@@ -301,7 +303,7 @@
301303
- J. Shane Culpepper
302304
- Mohammad Aliannejadi
303305
- James Allan
304-
- Enrique Amigó
306+
- "Enrique Amig\xF3"
305307
- Jaime Arguello
306308
- Leif Azzopardi
307309
- Peter Bailey
@@ -347,7 +349,7 @@
347349
- Ian Soboroff
348350
- Damiano Spina
349351
- Paul Thomas
350-
- Julián Urbano
352+
- "Juli\xE1n Urbano"
351353
- Arjen P. de Vries
352354
- Ryen White
353355
- Abby Yuan

auto_citation.py

Lines changed: 136 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,142 @@
11
import requests
2-
from bs4 import BeautifulSoup
2+
import xml.etree.ElementTree as ET
3+
import html
34
import yaml
4-
import re
5-
from datetime import datetime
6-
import time
75

86

9-
# 主函数
10-
def main():
11-
# Zhicheng Dou的dblp页面URL
12-
dblp_url = "https://dblp.org/pid/18/5740.html"
13-
14-
# get html page and save to file
15-
response = requests.get(dblp_url)
16-
with open("zhicheng_dou.html", "w", encoding="utf-8") as f:
17-
f.write(response.text)
7+
def _month_number(month_text):
    """Normalize a DBLP <month> value to a zero-padded two-digit string.

    Accepts full names ("January"), common abbreviations ("Jan") and
    plain digits ("3"); anything unrecognized falls back to "01".
    """
    month_map = {
        'January': '01', 'February': '02', 'March': '03', 'April': '04',
        'May': '05', 'June': '06', 'July': '07', 'August': '08',
        'September': '09', 'October': '10', 'November': '11', 'December': '12',
        'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
        'Jun': '06', 'Jul': '07', 'Aug': '08',
        'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12',
    }
    cleaned = month_text.strip()
    if cleaned in month_map:
        return month_map[cleaned]
    if cleaned.isdigit():
        return cleaned.zfill(2)
    return "01"  # default fallback for unparseable values


def _publication_entry(pub):
    """Build one sources.yaml entry dict from a DBLP publication element.

    *pub* is an <article>, <inproceedings>, <phdthesis>, ... element as
    found under <dblpperson><r> in the DBLP person XML export.
    """
    # Title: itertext() flattens mixed content (<i>, <sub>, ...), where
    # .text alone would be None and crash the trailing-dot check below.
    title_elem = pub.find('title')
    title_text = ''.join(title_elem.itertext()) if title_elem is not None else ""
    title_text = title_text.strip() or "Unknown Title"
    # Strip the trailing period DBLP appends to titles.
    if title_text.endswith('.'):
        title_text = title_text[:-1]

    authors = [a.text for a in pub.findall('author') if a.text]

    # Venue priority: journal -> proceedings -> school (for theses).
    publisher = "Unknown Publisher"
    for tag in ('journal', 'booktitle', 'school'):
        elem = pub.find(tag)
        if elem is not None and elem.text:
            publisher = elem.text
            break

    # Date as YYYY-MM-01; DBLP rarely provides a day, so it is fixed to 01.
    # Guard .text as well as the element: an empty tag has text None.
    year = pub.find('year')
    year_text = year.text if year is not None and year.text else "0000"
    month = pub.find('month')
    month_text = month.text if month is not None and month.text else "01"
    date_str = f"{year_text}-{_month_number(month_text)}-01"

    # Link: prefer the electronic edition (usually a DOI URL), falling
    # back to the DBLP record page for the publication key.
    ee = pub.find('ee')
    if ee is not None and ee.text:
        link = ee.text
    else:
        # DBLP record pages live under /rec/<key>; the bare
        # f"https://dblp.org/{key}" form produced broken URLs.
        key = pub.get('key') or ""
        link = f"https://dblp.org/rec/{key}"

    # Prefer a "doi:<suffix>" id when the link is a DOI resolver URL,
    # e.g. id: doi:10.1109/TKDE.2023.3291006; otherwise use the link.
    paper_id = link
    if "doi.org/" in link:
        paper_id = f"doi:{link.split('doi.org/')[-1]}"

    # Insertion order here is the key order that ends up in the YAML.
    return {
        'id': paper_id,
        'title': title_text,
        'authors': authors,
        'publisher': publisher,
        'date': date_str,
        'link': link,
        'type': 'paper',
        'plugin': 'sources.py',
        'file': 'sources.yaml',
    }


def extract_dblp_to_yaml(url="https://dblp.org/pid/18/5740.xml",
                         output_path="sources.yaml"):
    """Fetch a DBLP person XML export and dump its publications to YAML.

    Downloads the publication list for the given DBLP person record,
    extracts id/title/authors/venue/date/link for each entry, sorts the
    entries newest-first, and writes them to *output_path*.

    Args:
        url: DBLP person XML export URL (defaults to pid/18/5740).
        output_path: Destination YAML file path.

    Returns:
        None. On a network or XML-parse failure a message is printed and
        the function returns without writing the output file.
    """
    print(f"Fetching data from {url}...")
    try:
        # Timeout prevents the script from hanging forever on a stalled
        # connection (the original call had none).
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching URL: {e}")
        return

    try:
        root = ET.fromstring(response.content)
    except ET.ParseError as e:
        print(f"Error parsing XML: {e}")
        return

    # XML layout: <dblpperson><r><article .../></r><r><inproceedings .../></r>...
    papers = [
        _publication_entry(pub)
        for r in root.findall('./r')
        for pub in r
    ]

    # Newest first; the zero-padded YYYY-MM-DD format makes string order
    # equal to chronological order.
    papers.sort(key=lambda entry: entry['date'], reverse=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        # SafeDumper + explicit dict representer keeps keys in insertion
        # order (id, title, authors, ...) for readable output;
        # allow_unicode preserves non-ASCII author names.
        class _OrderedDumper(yaml.SafeDumper):
            pass

        def _dict_representer(dumper, data):
            return dumper.represent_dict(data.items())

        _OrderedDumper.add_representer(dict, _dict_representer)

        yaml.dump(papers, f, Dumper=_OrderedDumper, allow_unicode=True,
                  sort_keys=False, default_flow_style=False)

    print(f"Successfully extracted {len(papers)} papers to {output_path}")
139+
18140

19141
# Script entry point: regenerate sources.yaml from the DBLP record.
if __name__ == "__main__":
    extract_dblp_to_yaml()

0 commit comments

Comments
 (0)