
Commit 9eb4310

WinstonLiyt and Copilot authored
[Refactor] Crawler implementation (abstract, Arxiv, Github) (#27)
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent e5eb684 commit 9eb4310

8 files changed: +641, -7 lines changed


.github/workflows/update_paper_links.yml

Lines changed: 3 additions & 7 deletions
@@ -5,6 +5,7 @@ on:
   workflow_dispatch:
   schedule:
     - cron: "0 2 * * 1"
+      # run every Monday at 2:00 AM

 env:
   GITHUB_USER_NAME: WinstonLiyte
@@ -18,6 +19,8 @@ jobs:
     steps:
       - name: Checkout
         uses: actions/checkout@v3
+        with:
+          ssh-key: ${{ secrets.DEPLOY_KEY }}

       - name: Set up Python Env
         uses: actions/setup-python@v4
@@ -34,13 +37,6 @@
         run: |
           python quant_scholar.py --update_paper_links
           cp README.md wiki/index.md
-
-      - name: Set up SSH
-        run: |
-          mkdir -p ~/.ssh
-          echo "${{ secrets.DEPLOY_KEY }}" > ~/.ssh/id_ed25519
-          chmod 600 ~/.ssh/id_ed25519
-          ssh-keyscan github.com >> ~/.ssh/known_hosts

       - name: Commit and Push
         run: |

autoscholar/crawler/__init__.py

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
from .base_crawler import BaseCrawler
from .arxiv_crawler import ArxivCrawler
from .github_crawler import GithubCrawler

__all__ = ["BaseCrawler", "ArxivCrawler", "GithubCrawler"]
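Not part of the commit, but for orientation: these re-exports let downstream code import the crawlers from the package root rather than from the individual modules, e.g.:

```python
# Import via the package's public API (the names listed in __all__ above).
from autoscholar.crawler import BaseCrawler, ArxivCrawler, GithubCrawler
```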
autoscholar/crawler/arxiv_crawler.py

Lines changed: 295 additions & 0 deletions
@@ -0,0 +1,295 @@
import json
import arxiv
import datetime
import requests
from typing import Dict, List, Any, Optional
from dataclasses import dataclass
from pathlib import Path

from autoscholar.crawler.base_crawler import BaseCrawler
from autoscholar.utils.logger import setup_logger

# ArXiv-specific constants
ARXIV_URL = "http://arxiv.org/"
BASE_URL = "https://arxiv.paperswithcode.com/api/v0/papers/"

# Set up logger
logger = setup_logger(__name__)


@dataclass
class ArxivCrawlerConfig:
    """Configuration class for ArxivCrawler.

    Attributes:
    ----------
    output_dir : str
        Directory to save the crawled data
    max_results : int
        Maximum number of papers to fetch per query
    download_pdf : bool
        Whether to download PDF files
    keywords : Dict[str, Any]
        Dictionary of search keywords and filters
    """

    output_dir: str = "data"
    max_results: int = 10
    download_pdf: bool = False
    keywords: Dict[str, Any] = None

    @classmethod
    def from_dict(cls, config_dict: Dict[str, Any]) -> "ArxivCrawlerConfig":
        """Create an ArxivCrawlerConfig instance from a dictionary.

        Parameters:
        ----------
        config_dict : Dict[str, Any]
            Dictionary containing configuration settings

        Returns:
        -------
        ArxivCrawlerConfig
            Configured instance
        """
        return cls(
            output_dir=config_dict.get("output_dir", "data"),
            max_results=config_dict.get("max_results", 10),
            download_pdf=config_dict.get("download_pdf", False),
            keywords=config_dict.get("keywords", {}),
        )


class ArxivCrawler(BaseCrawler):
    """Crawler for fetching papers from arXiv.

    This crawler uses the arXiv API to fetch papers based on queries
    and saves the data in a structured format.
    """

    def __init__(self, **kwargs):
        """Initialize the ArXiv crawler.

        Parameters:
        ----------
        **kwargs : Any
            Optional parameters that can be used by the crawler.
        """
        super().__init__(**kwargs)
        self.config = ArxivCrawlerConfig.from_dict(kwargs)
        self.all_results = {}

    def get_authors(
        self, authors: List[str], partial_author: bool = False
    ) -> str:
        """Retrieve a formatted string of authors.

        Parameters:
        ----------
        authors : List[str]
            List of author names.
        partial_author : bool, optional
            If True, return only the first three authors.

        Returns:
        -------
        str
            String of author names.
        """
        if not partial_author:
            return ", ".join(str(author) for author in authors)
        else:
            return ", ".join(str(author) for author in authors[:3])

    def _get_pdf_folder(self, topic: str, date: datetime.date) -> Path:
        """Get the folder path for storing PDFs based on topic and date.

        Parameters:
        ----------
        topic : str
            Paper topic
        date : datetime.date
            Paper publication date

        Returns:
        -------
        Path
            Path to the PDF folder
        """
        # Create folder structure: output_dir/arxiv/topic/YYYY-MM
        folder_path = (
            Path(self.config.output_dir)
            / "arxiv"
            / topic
            / date.strftime("%Y-%m")
        )
        folder_path.mkdir(parents=True, exist_ok=True)
        return folder_path

    def _download_pdf(
        self, result: arxiv.Result, paper_key: str, topic: str
    ) -> None:
        """Download PDF for a paper.

        Parameters:
        ----------
        result : arxiv.Result
            Paper result from arXiv API
        paper_key : str
            Paper key (ID without version)
        topic : str
            Topic name for categorization
        """
        try:
            # Get the appropriate folder based on topic and date
            pdf_folder = self._get_pdf_folder(topic, result.published.date())
            pdf_path = pdf_folder / f"{paper_key}.pdf"

            pdf_response = requests.get(result.pdf_url)
            with open(pdf_path, "wb") as f:
                f.write(pdf_response.content)
            logger.info(f"Downloaded PDF for {paper_key} to {pdf_path}")
        except requests.exceptions.RequestException as e:
            logger.error(f"Error downloading PDF for {paper_key}: {e}")

    def _get_code_url(self, paper_id: str) -> Optional[str]:
        """Get code repository URL for a paper.

        Parameters:
        ----------
        paper_id : str
            arXiv paper ID

        Returns:
        -------
        Optional[str]
            Code repository URL if found, None otherwise
        """
        try:
            response = requests.get(f"{BASE_URL}{paper_id}").json()
            if "official" in response and response["official"]:
                return response["official"]["url"]
        except (
            requests.exceptions.RequestException,
            json.JSONDecodeError,
        ) as e:
            logger.error(f"Error getting code URL for {paper_id}: {e}")
        return None

    def _process_paper(
        self, result: arxiv.Result, topic: str
    ) -> Dict[str, Any]:
        """Process a single paper result.

        Parameters:
        ----------
        result : arxiv.Result
            Paper result from arXiv API
        topic : str
            Topic name for categorization

        Returns:
        -------
        Dict[str, Any]
            Processed paper data
        """
        paper_id = result.get_short_id()
        paper_key = paper_id.split("v")[0]  # Remove version number
        paper_url = f"{ARXIV_URL}abs/{paper_key}"

        # Get code repository URL if available
        repo_url = self._get_code_url(paper_id)

        # Download PDF if enabled
        if self.config.download_pdf:
            self._download_pdf(result, paper_key, topic)

        return {
            "topic": topic,
            "title": result.title,
            "authors": self.get_authors(result.authors),
            "first_author": self.get_authors(
                result.authors, partial_author=True
            ),
            "abstract": result.summary.replace("\n", " "),
            "url": paper_url,
            "code_url": repo_url,
            "category": result.primary_category,
            "publish_time": str(result.published.date()),
            "update_time": str(result.updated.date()),
            "comments": result.comment.replace("\n", " ")
            if result.comment
            else "",
        }

    def _fetch_papers(self, topic: str, query: str, max_results: int) -> None:
        """Fetch papers for a specific topic.

        Parameters:
        ----------
        topic : str
            Topic name for categorization
        query : str
            Search query string
        max_results : int
            Maximum number of papers to fetch
        """
        search = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.SubmittedDate,
        )

        for result in search.results():
            paper_key = result.get_short_id().split("v")[0]
            self.all_results[paper_key] = self._process_paper(result, topic)
            logger.info(f"Processed paper: {result.title}")

    def _save_results(self) -> None:
        """Save all crawled results to a single JSON file."""
        if not self.all_results:
            logger.warning("No results to save")
            return

        output_dir = Path(self.config.output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        today = datetime.date.today().strftime("%Y-%m-%d")
        output_path = output_dir / f"arxiv_papers_{today}.json"

        # Load existing data if any
        if output_path.exists():
            with open(output_path, "r") as f:
                existing_data = json.load(f)
        else:
            existing_data = {}

        # Update with new data
        existing_data.update(self.all_results)

        # Write back to file
        with open(output_path, "w") as f:
            json.dump(existing_data, f, indent=2)
        logger.info(f"Saved {len(self.all_results)} papers to {output_path}")

    def run(self, **kwargs) -> None:
        """Execute the arXiv crawler workflow."""
        logger.info("Starting arXiv crawler")

        keywords = self.config.keywords or {}
        max_results = self.config.max_results

        logger.info("Fetching data begin")
        for topic, keyword_info in keywords.items():
            if isinstance(keyword_info, dict) and "filters" in keyword_info:
                query = " OR ".join(keyword_info["filters"])
                topic_max_results = keyword_info.get("max_results", max_results)
            else:
                query = topic
                topic_max_results = max_results

            logger.info(f"Processing topic: {topic}, query: {query}")
            self._fetch_papers(topic, query, topic_max_results)

        self._save_results()
        logger.info("Fetching data end")
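Not part of the commit: a minimal usage sketch of the new crawler, assuming the keyword/filters structure that run() expects and that BaseCrawler (not shown in this diff) accepts arbitrary keyword arguments. Topic names and filter strings are made-up examples.

```python
# Illustrative sketch only -- not part of this commit.
# Assumes BaseCrawler accepts arbitrary kwargs and that the `arxiv` and
# `requests` packages are installed.
from autoscholar.crawler import ArxivCrawler

crawler = ArxivCrawler(
    output_dir="data",       # JSON output (and PDFs) are written under this folder
    max_results=10,          # default cap on papers fetched per topic
    download_pdf=False,      # True would save PDFs under data/arxiv/<topic>/<YYYY-MM>/
    keywords={
        # A topic may map to a dict with "filters" (joined with " OR " into the
        # arXiv query) and an optional per-topic "max_results" override ...
        "portfolio optimization": {
            "filters": ['ti:"portfolio optimization"', "cat:q-fin.PM"],
            "max_results": 5,
        },
        # ... otherwise the topic name itself is used as the query.
        "market microstructure": None,
    },
)

# Fetches each topic, optionally downloads PDFs, and merges results into
# data/arxiv_papers_<YYYY-MM-DD>.json, keyed by arXiv ID without version.
crawler.run()
```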
