Skip to content

Commit 991661f

Browse files
committed
download top articles
1 parent f14801e commit 991661f

File tree

2 files changed

+237
-31
lines changed

2 files changed

+237
-31
lines changed

HissMed/retrieve_articles.py

Lines changed: 232 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,24 @@
33
import requests
44
import argparse
55
from Bio import Entrez
6+
import xml.etree.ElementTree as ET
7+
from datetime import datetime
8+
import io
9+
import pandas as pd
10+
import numpy as np
11+
import html
12+
from typing import List, Dict
613

714
class PapersDownloader:
815
headers = {"User-Agent": "Mozilla/5.0"}
916
output_folder = "./literature"
10-
sleep_time = .5
17+
sleep_time = 0
1118

1219
@classmethod
13-
def create_directory(cls):
20+
def create_directory(cls) -> None:
21+
"""
22+
Create the output directory if it doesn't exist.
23+
"""
1424
if os.path.exists(cls.output_folder):
1525
for filename in os.listdir(cls.output_folder):
1626
file_path = os.path.join(cls.output_folder, filename)
@@ -25,32 +35,202 @@ def create_directory(cls):
2535
os.makedirs(cls.output_folder)
2636

2737
@staticmethod
def get_args() -> argparse.Namespace:
    """
    Parse the command-line options of the downloader.

    Returns:
        argparse.Namespace: Parsed values for ``query``, ``email``, ``top_n``,
        ``journal_w``, ``year_w`` and ``citation_w``.
    """
    # (flag, type, help text, extra add_argument keywords), in declaration order.
    option_specs = (
        ("--query", str, "Search query for PMC articles.", {}),
        ("--email", str, "Your email address for NCBI.", {}),
        ("--top_n", int, "Number of top articles to download.", {"default": 10}),
        ("--journal_w", float, "Journal ranking weight.", {"default": 0.9}),
        ("--year_w", float, "Publication year weight.", {"default": 0.5}),
        ("--citation_w", float, "Citations weight.", {"default": 0.7}),
    )
    cli = argparse.ArgumentParser(description="Download PDFs from PMC.")
    for flag, value_type, help_text, extra in option_specs:
        cli.add_argument(flag, type=value_type, help=help_text, **extra)
    return cli.parse_args()
3454

3555
@staticmethod
def get_total_count(query: str, db: str = "pmc") -> int:
    """
    Get the total number of articles found for a given query.

    Args:
        query (str): The search query.
        db (str): The database to search (default: "pmc").

    Returns:
        int: The total number of articles found, or 0 when the search
        fails (the error is printed rather than raised).
    """
    print(f"{datetime.now()} - Searching for articles with query: {query}")
    try:
        # retmax=0: only the "Count" field of the answer is used here.
        handle = Entrez.esearch(db=db, term=query, retmax=0)
        try:
            record = Entrez.read(handle)
        finally:
            # Release the response even when parsing fails.
            handle.close()
        total = int(record["Count"])
    except Exception as e:
        # Best-effort: a failed search is reported and counted as zero hits.
        print(f"Error searching for query '{query}': {e}")
        return 0
    print(f"{datetime.now()} - Total articles found: {total}")
    return total
4476

4577
@staticmethod
def get_pmc_ids(query: str, retstart: int, retmax: int = 100, db: str = "pmc") -> List[str]:
    """
    Get a list of PMC IDs for a given query.

    Args:
        query (str): The search query.
        retstart (int): The start index for retrieving results.
        retmax (int): The maximum number of results to retrieve (default: 100).
        db (str): The database to search (default: "pmc").

    Returns:
        List[str]: The "IdList" entry of the parsed search result.
    """
    handle = Entrez.esearch(db=db, term=query, retstart=retstart, retmax=retmax)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the response even when parsing fails.
        handle.close()
    return record["IdList"]
5195

96+
@staticmethod
def get_articles_info(id_list: List[str], db="pmc") -> ET.Element:
    """
    Get detailed information about a list of articles.

    Args:
        id_list (List[str]): A list of article IDs.
        db (str): The database to search (default: "pmc").

    Returns:
        ET.Element: The root of the parsed XML answer. For an empty
        ``id_list`` no request is made and an empty element is returned.
    """
    print(f"{datetime.now()} - Fetching articles info...")
    if not id_list:
        # Nothing to fetch: hand back an empty document so callers that
        # iterate over './/article' simply find no entries.
        return ET.Element("pmc-articleset")
    fetch_handle = Entrez.efetch(db=db, id=id_list, retmode="xml")
    try:
        data = fetch_handle.read()
    finally:
        # Release the response even when reading fails.
        fetch_handle.close()
    return ET.fromstring(data)
114+
115+
@classmethod
def get_journal_ranking(cls) -> pd.DataFrame:
    """
    Download the SCImago journal ranking and return titles with their SJR.

    Returns:
        pd.DataFrame: Two columns, ``Title`` (HTML entities decoded) and
        ``SJR`` (SCImago Journal Rank value).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within 30 seconds.
    """
    print(f"{datetime.now()} - Fetching journal ranking...")
    # Despite "out=xls" in the URL, the payload is parsed below as
    # ';'-separated text with ',' as the decimal mark.
    url = "https://www.scimagojr.com/journalrank.php?out=xls"
    # Same 30 s timeout as download_pdf, so a stalled server cannot hang the run.
    response = requests.get(url, headers=cls.headers, timeout=30)
    response.raise_for_status()
    df = pd.read_csv(io.BytesIO(response.content), sep=';', decimal=",")
    # .copy() so the column assignment below writes to an independent frame
    # instead of a slice of the full table.
    df = df[['Title', 'SJR']].copy()
    df["Title"] = df["Title"].apply(html.unescape)
    return df
136+
137+
138+
@classmethod
def get_articles_metrics(cls, query: str, db: str = "pmc") -> Dict:
    """
    Get metrics for articles based on the search query.

    Args:
        query (str): The search query.
        db (str): The database to search (default: "pmc").

    Returns:
        Dict: Maps each PMC ID to a dict with the keys
        ``journal_title`` (str, "N/A" when absent),
        ``journal_ranking`` (float SJR, 0 when the title is not ranked),
        ``pub_year`` (int, "N/A" when absent) and
        ``citations`` (float, the number of ``<ref>`` entries found in the
        article's reference lists). Empty when the query has no hits.
    """
    count = cls.get_total_count(query=query, db=db)
    if count == 0:
        # No hits (or the search failed): skip the remaining requests.
        return {}
    pmc_ids = cls.get_pmc_ids(query=query, retstart=0, retmax=count, db=db)
    root = cls.get_articles_info(id_list=pmc_ids, db=db)
    journal_ranking = cls.get_journal_ranking()
    print(f"{datetime.now()} - Fetching articles metrics...")

    # Build the title -> SJR lookup once instead of scanning the DataFrame
    # for every article; the first row per title wins, as it did before.
    unique_titles = journal_ranking.drop_duplicates(subset='Title', keep='first')
    sjr_by_title = dict(zip(unique_titles['Title'], unique_titles['SJR']))

    metrics = {}
    for article in root.findall('.//article'):
        id_elem = article.find(".//article-id[@pub-id-type='pmc']")
        if id_elem is None or not id_elem.text:
            # Without a PMC ID the article cannot be keyed or downloaded.
            continue
        pmc_id = id_elem.text
        journal_elem = article.find('.//journal-meta//journal-title-group//journal-title')
        journal_title = journal_elem.text if journal_elem is not None else "N/A"
        journal_SJR = float(sjr_by_title[journal_title]) if journal_title in sjr_by_title else 0
        pub_date_elem = article.find('.//article-meta//pub-date/year')
        pub_year = int(pub_date_elem.text) if pub_date_elem is not None else "N/A"
        # Counts every <ref> element under any <ref-list>.
        citations = len(article.findall('.//ref-list//ref'))
        metrics[pmc_id] = {"journal_title": journal_title,
                           "journal_ranking": journal_SJR,
                           "pub_year": pub_year,
                           "citations": float(citations)}
    return metrics
172+
173+
174+
@staticmethod
def rank_articles(metrics: Dict, journal_w=0.9, year_w=0.5, citation_w=0.7) -> pd.DataFrame:
    """
    Rank articles based on journal ranking, publication year, and number of citations.

    Each of the three metrics is min-max scaled to [0, 1] (citations are
    log10-transformed first) and combined into a weighted sum.

    Args:
        metrics (Dict): Maps an article ID to a dict with the keys
            ``journal_title``, ``journal_ranking``, ``pub_year``, ``citations``.
        journal_w (float): The weight for journal ranking (default: 0.9).
        year_w (float): The weight for publication year (default: 0.5).
        citation_w (float): The weight for number of citations (default: 0.7).

    Returns:
        pd.DataFrame: One row per article, indexed by article ID, with the
        scaled metric columns and a ``score`` column, sorted by ``score``
        from best to worst. Empty (but with those columns) for empty input.
    """
    print(f"{datetime.now()} - Ranking articles...")
    if not metrics:
        return pd.DataFrame(
            columns=["journal_title", "journal_ranking", "pub_year", "citations", "score"]
        )

    def _min_max(series: pd.Series) -> pd.Series:
        """Scale to [0, 1]; a constant column or a missing value contributes 0."""
        low, high = series.min(), series.max()
        span = high - low
        if pd.isna(span) or span == 0:
            # All values equal (e.g. a single article): avoid 0/0 -> NaN.
            return pd.Series(0.0, index=series.index)
        return ((series - low) / span).fillna(0.0)

    df = pd.DataFrame(metrics).T
    # The transposed frame holds Python objects; coerce to numbers so that
    # placeholders such as "N/A" become NaN instead of breaking the arithmetic.
    for column in ("journal_ranking", "pub_year", "citations"):
        df[column] = pd.to_numeric(df[column], errors='coerce')

    df['journal_ranking'] = _min_max(df['journal_ranking'])
    df['pub_year'] = _min_max(df['pub_year'])
    # Logarithmic citations before scaling; the small offset keeps log10(0) finite.
    df['citations'] = _min_max(np.log10(df['citations'].fillna(0) + 1e-6))

    df['score'] = journal_w * df['journal_ranking'] + year_w * df['pub_year'] + citation_w * df['citations']
    # sort_values returns a new frame; the sorted result must be the one returned.
    return df.sort_values(by='score', ascending=False, kind='stable')
201+
52202
@classmethod
def get_top_n_articles(cls, query: str, n_top_articles=10, db="pmc", **kwargs) -> pd.DataFrame:
    """
    Get the top N articles based on the ranking.

    Args:
        query (str): The search query.
        n_top_articles (int): The number of top articles to retrieve.
        db (str): The database to search (default: "pmc").
        **kwargs: Additional keyword arguments forwarded to ``rank_articles``.

    Returns:
        pd.DataFrame: The first ``n_top_articles`` rows of the ranking,
        ordered by ``score`` from best to worst.
    """
    metrics = cls.get_articles_metrics(query=query, db=db)
    df = cls.rank_articles(metrics=metrics, **kwargs)
    print(f"{datetime.now()} - Getting top {n_top_articles} articles...")
    if "score" in df.columns:
        # The positional slice below only means "top" on a frame ordered
        # best-first, so establish that order explicitly here.
        df = df.sort_values(by="score", ascending=False)
    return df.iloc[:n_top_articles]
221+
222+
@classmethod
223+
def download_pdf(cls, pmc_id:str, pdf_url:str)->None:
224+
"""
225+
Download a PDF file from a given URL.
226+
227+
Args:
228+
pmc_id (str): The PMC ID of the article.
229+
pdf_url (str): The URL of the PDF file.
230+
231+
Returns:
232+
None
233+
"""
54234
response = requests.get(pdf_url, headers=cls.headers, timeout=30)
55235
if response.status_code == 200 and response.headers.get("Content-Type", "").startswith("application/pdf"):
56236
file_path = os.path.join(cls.output_folder, f"{pmc_id}.pdf")
@@ -61,47 +241,69 @@ def download_pdf(cls, pmc_id, pdf_url):
61241
print(f" Failed to download {pmc_id} (Status: {response.status_code})")
62242

63243
@classmethod
def download_pdfs(cls, pmc_ids: List[str]) -> None:
    """
    Fetch the PDF of every article in ``pmc_ids``, one request at a time.

    A failure for one article is printed and does not stop the remaining
    downloads.

    Args:
        pmc_ids (List[str]): Article IDs, with or without the "PMC" prefix.

    Returns:
        None
    """
    for raw_id in pmc_ids:
        article_id = raw_id if raw_id.startswith("PMC") else "PMC" + raw_id
        target_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{article_id}/pdf/"
        try:
            cls.download_pdf(pmc_id=article_id, pdf_url=target_url)
        except Exception as err:
            print(f" Error downloading {article_id}: {err}")
        # NOTE(review): indentation is lost in this view; the pause is
        # assumed to run once per article, after the attempt — confirm.
        time.sleep(cls.sleep_time)
74263

75264
@classmethod
def batch_download_pdfs(cls, query: str, retmax=100, db="pmc", *, n_top_articles=10, **kwargs) -> None:
    """
    Download the PDFs of the best-ranked articles for a given query.

    Args:
        query (str): The search query.
        retmax (int): The maximum number of articles handed to
            ``download_pdfs`` per batch; values below 1 are treated as 1.
        db (str): The database to search (default: "pmc").
        n_top_articles (int): How many of the best-ranked articles to
            download (keyword-only, default: 10).
        **kwargs: Additional keyword arguments for ranking.

    Returns:
        None
    """
    top_n_articles = cls.get_top_n_articles(
        query=query, n_top_articles=n_top_articles, db=db, **kwargs
    )
    print(f"{datetime.now()} - Downloading top {top_n_articles.shape[0]} articles...")
    if top_n_articles.shape[0] == 0:
        return
    pmc_ids = top_n_articles.index.tolist()
    # range() rejects a zero step, so never batch by less than one article.
    batch_size = max(int(retmax), 1)
    for start in range(0, len(pmc_ids), batch_size):
        cls.download_pdfs(pmc_ids[start:start + batch_size])
88286

89287
@classmethod
def set_email(cls, email: str) -> None:
    """
    Set the email address for the NCBI Entrez API.

    Args:
        email (str): The email address.

    Returns:
        None

    Raises:
        ValueError: If ``email`` is missing/empty or still one of the
            placeholder addresses used as defaults in this project.
    """
    # Both placeholders appear as defaults: one in settings.json, one in the
    # script entry point. Neither is a real contact address.
    placeholders = {"your.email@mail.com", "your.email@example.com"}
    if not email or email.strip() in placeholders:
        raise ValueError("Please set your email address")
    Entrez.email = email
101301

302+
102303
if __name__ == "__main__":
    # Script entry point: parse the CLI, configure Entrez, reset the output
    # folder, then rank the query results and download the best ones.
    args = PapersDownloader.get_args()
    query = args.query or '(Multiple Myeloma[Title]) AND ("2024/01/01"[Publication Date] : "2025/12/31"[Publication Date])'
    email = args.email or "your.email@example.com"
    PapersDownloader.set_email(email)
    PapersDownloader.create_directory()
    # Forward the ranking options from the command line; previously they
    # were parsed but never used, so the defaults always applied.
    top_articles = PapersDownloader.get_top_n_articles(
        query=query,
        n_top_articles=args.top_n,
        journal_w=args.journal_w,
        year_w=args.year_w,
        citation_w=args.citation_w,
    )
    print(f"{datetime.now()} - Downloading top {top_articles.shape[0]} articles...")
    PapersDownloader.download_pdfs(top_articles.index.tolist())

settings.json

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
11
{
22
"email": "your.email@mail.com",
3-
"top_references": 5
3+
"top_references": 5,
4+
"top_n_articles": 10,
5+
"citation_weight": 0.7,
6+
"year_weight": 0.5,
7+
"journal_weight": 0.9
48
}

0 commit comments

Comments
 (0)