This repository was archived by the owner on Nov 10, 2025. It is now read-only.

Commit 149dc54

arxiv_paper_tool.py
1 parent fd698be commit 149dc54

1 file changed: 124 additions & 0 deletions
@@ -0,0 +1,124 @@
import os
import re
import time
import urllib.request
import urllib.parse
import xml.etree.ElementTree as ET
import logging
from typing import Type, List, Optional

from pydantic import BaseModel, Field
from crewai.tools import BaseTool

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Atom XML namespace used by arXiv API responses
ATOM = '{http://www.w3.org/2005/Atom}'


class ArxivToolInput(BaseModel):
    search_query: str = Field(..., description="Search query for Arxiv, e.g., 'transformer neural network'")
    max_results: int = Field(5, description="Maximum number of results to fetch from Arxiv")
    download_pdfs: Optional[bool] = Field(False, description="If True, download PDFs to a local folder")
    save_dir: Optional[str] = Field("./arxiv_pdfs", description="Directory path to save downloaded PDFs")
    use_title_as_filename: Optional[bool] = Field(False, description="Use the paper title as the PDF filename instead of the arXiv ID")


class ArxivPaperTool(BaseTool):
    name: str = "Arxiv Paper Fetcher and Downloader"
    description: str = "Fetches metadata from Arxiv based on a search query and optionally downloads PDFs."
    args_schema: Type[BaseModel] = ArxivToolInput

    def _run(self, **kwargs) -> str:
        try:
            # Validate and unpack arguments through the pydantic schema
            args = ArxivToolInput(**kwargs)

            logger.info(f"Running Arxiv tool: query='{args.search_query}', max_results={args.max_results}, "
                        f"download_pdfs={args.download_pdfs}, save_dir='{args.save_dir}', "
                        f"use_title_as_filename={args.use_title_as_filename}")

            papers = self.fetch_arxiv_data(args.search_query, args.max_results)

            if args.download_pdfs:
                os.makedirs(args.save_dir, exist_ok=True)
                for paper in papers:
                    if paper['pdf_url']:
                        if args.use_title_as_filename:
                            # Strip characters that are illegal in filenames
                            safe_title = re.sub(r'[\\/*?:"<>|]', "_", paper['title']).strip()
                            filename_base = safe_title or paper['arxiv_id']
                        else:
                            filename_base = paper['arxiv_id']
                        # Truncate so the name stays under common filesystem limits (~255 bytes)
                        filename = f"{filename_base[:200]}.pdf"
                        save_path = os.path.join(args.save_dir, filename)
                        self.download_pdf(paper['pdf_url'], save_path)
                        time.sleep(1)  # Be polite to the arXiv servers

            results = []
            for p in papers:
                authors_str = ', '.join(p['authors'])
                summary_snippet = (p['summary'][:300] + "...") if len(p['summary']) > 300 else p['summary']
                results.append(
                    f"Title: {p['title']}\nAuthors: {authors_str}\nPublished: {p['published_date']}\n"
                    f"PDF: {p['pdf_url'] or 'N/A'}\nSummary: {summary_snippet}"
                )

            # Parenthesize the separator so the whole divider, not just the
            # trailing "\n\n", is used by join
            separator = "\n\n" + "-" * 80 + "\n\n"
            return separator.join(results)

        except Exception as e:
            logger.error(f"ArxivTool Error: {e}")
            return f"Failed to fetch or download Arxiv papers: {e}"

    def fetch_arxiv_data(self, search_query: str, max_results: int) -> List[dict]:
        api_url = (f"http://export.arxiv.org/api/query?"
                   f"search_query={urllib.parse.quote(search_query)}&start=0&max_results={max_results}")
        logger.info(f"Fetching data from Arxiv API: {api_url}")
        data = urllib.request.urlopen(api_url).read().decode('utf-8')
        root = ET.fromstring(data)
        papers = []

        for entry in root.findall(f'{ATOM}entry'):
            raw_id = entry.find(f'{ATOM}id').text.strip()
            # e.g. http://arxiv.org/abs/2101.12345v1 -> "2101_12345v1"
            # (dots replaced so the ID is safe to use as a filename)
            arxiv_id = raw_id.split('/')[-1].replace('.', '_')

            title = entry.find(f'{ATOM}title').text.strip()
            summary = entry.find(f'{ATOM}summary').text.strip()
            published = entry.find(f'{ATOM}published').text
            authors = [
                author.find(f'{ATOM}name').text
                for author in entry.findall(f'{ATOM}author')
            ]

            # Prefer the link explicitly titled 'pdf'; otherwise fall back to
            # any link whose href mentions 'pdf'
            pdf_url = None
            for link in entry.findall(f'{ATOM}link'):
                if link.attrib.get('title') == 'pdf':
                    pdf_url = link.attrib.get('href')
                    break
            if not pdf_url:
                for link in entry.findall(f'{ATOM}link'):
                    href = link.attrib.get('href')
                    if href and 'pdf' in href:
                        pdf_url = href
                        break

            papers.append({
                "arxiv_id": arxiv_id,
                "title": title,
                "summary": summary,
                "authors": authors,
                "published_date": published,
                "pdf_url": pdf_url
            })

        return papers

    def download_pdf(self, pdf_url: str, save_path: str):
        try:
            logger.info(f"Downloading PDF from {pdf_url} to {save_path}")
            urllib.request.urlretrieve(pdf_url, save_path)
            logger.info(f"PDF saved: {save_path}")
        except Exception as e:
            logger.error(f"Failed to download PDF from {pdf_url}: {e}")
            raise
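Since _run validates its keyword arguments through ArxivToolInput, the tool can be smoke-tested on its own. A minimal sketch, assuming the file is saved as arxiv_paper_tool.py (per the commit message); the query string and result count are arbitrary examples, not anything prescribed by the commit:

# Quick local test of the tool above; calls the underlying _run directly.
# The query and max_results values are illustrative assumptions.
from arxiv_paper_tool import ArxivPaperTool

tool = ArxivPaperTool()
output = tool._run(
    search_query="transformer neural network",
    max_results=3,
    download_pdfs=False,  # set True to also save PDFs under ./arxiv_pdfs
)
print(output)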

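Because ArxivPaperTool subclasses crewai.tools.BaseTool, it can also be handed to a CrewAI agent. A hypothetical wiring sketch follows: the role, goal, backstory, task text, and model name are all placeholder assumptions, and an API key for the chosen model is assumed to be configured in the environment.

# Hypothetical crew wiring; all prompt text and the model name are placeholders.
from crewai import Agent, Task, Crew, LLM
from crewai.process import Process
from arxiv_paper_tool import ArxivPaperTool

llm = LLM(model="gpt-4o-mini")  # assumes OPENAI_API_KEY is set in the environment

researcher = Agent(
    role="Research Assistant",
    goal="Find and summarize recent arXiv papers on a given topic",
    backstory="You search arXiv and report concise, accurate paper summaries.",
    tools=[ArxivPaperTool()],
    llm=llm,
)

task = Task(
    description="Find 3 recent arXiv papers about transformer neural networks and summarize them.",
    expected_output="Title, authors, publication date, and PDF link for each paper, with a short summary.",
    agent=researcher,
)

crew = Crew(agents=[researcher], tasks=[task], process=Process.sequential)
print(crew.kickoff())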