33import requests
44import argparse
55from Bio import Entrez
6+ import xml .etree .ElementTree as ET
7+ from datetime import datetime
8+ import io
9+ import pandas as pd
10+ import numpy as np
11+ import html
12+ from typing import List , Dict
613
714class PapersDownloader :
815 headers = {"User-Agent" : "Mozilla/5.0" }
916 output_folder = "./literature"
10- sleep_time = .5
17+ sleep_time = 0
1118
1219 @classmethod
13- def create_directory (cls ):
20+ def create_directory (cls ) -> None :
21+ """
22+ Create the output directory if it doesn't exist.
23+ """
1424 if os .path .exists (cls .output_folder ):
1525 for filename in os .listdir (cls .output_folder ):
1626 file_path = os .path .join (cls .output_folder , filename )
@@ -25,32 +35,202 @@ def create_directory(cls):
2535 os .makedirs (cls .output_folder )
2636
2737 @staticmethod
28- def get_args ():
38+ def get_args () -> argparse .Namespace :
39+ """
40+ Get command-line arguments.
41+
42+ Returns:
43+ argparse.Namespace: The parsed arguments.
44+ """
2945 parser = argparse .ArgumentParser (description = "Download PDFs from PMC." )
3046 parser .add_argument ("--query" , type = str , help = "Search query for PMC articles." )
3147 parser .add_argument ("--email" , type = str , help = "Your email address for NCBI." )
48+ parser .add_argument ("--top_n" , type = int , help = "Number of top articles to download." , default = 10 )
49+ parser .add_argument ("--journal_w" , type = float , help = "Journal ranking weight." , default = 0.9 )
50+ parser .add_argument ("--year_w" , type = float , help = "Publication year weight." , default = 0.5 )
51+ parser .add_argument ("--citation_w" , type = float , help = "Citations weight." , default = 0.7 )
3252 args = parser .parse_args ()
3353 return args
3454
@staticmethod
def get_total_count(query: str, db: str = "pmc") -> int:
    """
    Get the total number of articles found for a given query.

    Args:
        query (str): The search query.
        db (str): The database to search (default: "pmc").

    Returns:
        int: The total number of articles found, or 0 when the search
        fails for any reason (the error is printed, not raised).
    """
    print(f"{datetime.now()} - Searching for articles with query: {query}")
    try:
        # retmax=0 asks only for the hit count, not for any ids.
        handle = Entrez.esearch(db=db, term=query, retmax=0)
        try:
            record = Entrez.read(handle)
        finally:
            # Release the underlying HTTP response even if parsing fails.
            handle.close()
        print(f"{datetime.now()} - Total articles found: {record['Count']}")
        return int(record["Count"])
    except Exception as e:
        # Deliberate best-effort boundary: a failed search is reported as
        # "no articles" so callers can simply stop.
        print(f"Error searching for query '{query}': {e}")
        return 0
4476
@staticmethod
def get_pmc_ids(query: str, retstart: int, retmax: int = 100, db: str = "pmc") -> List[str]:
    """
    Get a list of PMC IDs for a given query.

    Args:
        query (str): The search query.
        retstart (int): The start index for retrieving results.
        retmax (int): The maximum number of results to retrieve (default: 100).
        db (str): The database to search (default: "pmc").

    Returns:
        List[str]: The ``IdList`` entry of the parsed search result.
    """
    handle = Entrez.esearch(db=db, term=query, retstart=retstart, retmax=retmax)
    try:
        record = Entrez.read(handle)
    finally:
        # Always release the response handle, including on parse errors.
        handle.close()
    return record["IdList"]
5195
96+ @staticmethod
97+ def get_articles_info (id_list :List [str ], db = "pmc" )-> ET .Element :
98+ """
99+ Get detailed information about a list of articles.
100+
101+ Args:
102+ id_list (List[str]): A list of article IDs.
103+ db (str): The database to search (default: "pmc").
104+
105+ Returns:
106+ ET.Element: An XML element containing the article information.
107+ """
108+ # print datetime.now up to the seconds
109+ print (f"{ datetime .now ()} - Fetching articles info..." )
110+ fetch_handle = Entrez .efetch (db = db , id = id_list , retmode = "xml" )
111+ data = fetch_handle .read ()
112+ root = ET .fromstring (data )
113+ return root
114+
@classmethod
def get_journal_ranking(cls) -> pd.DataFrame:
    """
    Get the journal ranking based on the SJR (SCImago Journal Rank) indicator.

    Returns:
        pd.DataFrame: A DataFrame with the columns ``Title`` (HTML entities
        unescaped) and ``SJR``.

    Raises:
        requests.HTTPError: If the ranking download returns an error status.
    """
    print(f"{datetime.now()} - Fetching journal ranking...")
    url = "https://www.scimagojr.com/journalrank.php?out=xls"
    # Bound the request so a stalled server cannot hang the run; 30 s
    # matches the timeout used for the PDF downloads.
    response = requests.get(url, headers=cls.headers, timeout=30)
    response.raise_for_status()
    # The payload is parsed as semicolon-separated text with decimal commas.
    table = pd.read_csv(io.BytesIO(response.content), sep=';', decimal=",")
    # Copy the two needed columns so the assignment below modifies an
    # independent frame instead of a view of the full table.
    ranking = table[['Title', 'SJR']].copy()
    ranking["Title"] = ranking["Title"].map(html.unescape, na_action="ignore")
    return ranking
136+
137+
@classmethod
def get_articles_metrics(cls, query: str, db: str = "pmc") -> Dict:
    """
    Get metrics for articles based on the search query.

    Args:
        query (str): The search query.
        db (str): The database to search (default: "pmc").

    Returns:
        Dict: Maps each article id to a dict with the keys
        ``journal_title`` (str, "N/A" when absent), ``journal_ranking``
        (float SJR, 0 when the journal is not in the ranking),
        ``pub_year`` (int, or "N/A" when missing/unparseable) and
        ``citations`` (float: the number of ``<ref>`` entries in the
        article's reference lists). Empty when the search has no hits.
    """
    count = cls.get_total_count(query=query, db=db)
    if count == 0:
        # No hits: skip the fetch and the ranking download entirely.
        return {}
    pmc_ids = cls.get_pmc_ids(query=query, retstart=0, retmax=count, db=db)
    if not pmc_ids:
        return {}
    root = cls.get_articles_info(id_list=pmc_ids, db=db)
    journal_ranking = cls.get_journal_ranking()
    print(f"{datetime.now()} - Fetching articles metrics...")

    # Build the title -> SJR lookup once instead of scanning the frame for
    # every article; keep="first" mirrors taking the first matching row.
    sjr_by_title = (
        journal_ranking.drop_duplicates(subset="Title", keep="first")
        .set_index("Title")["SJR"]
        .to_dict()
    )

    metrics = {}
    for article in root.findall('.//article'):
        id_elem = article.find(".//article-id[@pub-id-type='pmc']")
        if id_elem is None:
            # NOTE(review): some PMC exports appear to label this id
            # 'pmcid' instead of 'pmc' - confirm against current output.
            id_elem = article.find(".//article-id[@pub-id-type='pmcid']")
        if id_elem is None or not id_elem.text:
            # Without an id the article cannot be keyed or downloaded.
            continue
        pmc_id = id_elem.text

        journal_elem = article.find('.//journal-meta//journal-title-group//journal-title')
        journal_title = journal_elem.text if journal_elem is not None else "N/A"
        journal_SJR = float(sjr_by_title.get(journal_title, 0))

        pub_date_elem = article.find('.//article-meta//pub-date/year')
        try:
            pub_year = int(pub_date_elem.text) if pub_date_elem is not None else "N/A"
        except (TypeError, ValueError):
            # Empty or non-numeric <year> text is treated as unknown.
            pub_year = "N/A"

        # Counts every <ref> under any <ref-list> of this article.
        citations = len(article.findall('.//ref-list//ref'))

        metrics[pmc_id] = {"journal_title": journal_title,
                           "journal_ranking": journal_SJR,
                           "pub_year": pub_year,
                           "citations": float(citations)}
    return metrics
172+
173+
174+ @staticmethod
175+ def rank_articles (metrics :Dict ,journal_w = 0.9 ,year_w = 0.5 ,citation_w = 0.7 )-> pd .DataFrame :
176+ """
177+ Rank articles based on journal ranking, publication year, and number of citations.
178+
179+ Args:
180+ metrics (Dict): A dictionary of article metrics.
181+ journal_w (float): The weight for journal ranking (default: 0.9).
182+ year_w (float): The weight for publication year (default: 0.5).
183+ citation_w (float): The weight for number of citations (default: 0.7).
184+
185+ Returns:
186+ pd.DataFrame: A DataFrame of ranked articles.
187+ """
188+ print (f"{ datetime .now ()} - Ranking articles..." )
189+ df = pd .DataFrame (metrics )
190+ df = df .T
191+ ## normalize the columns
192+ df ['journal_ranking' ] = (df ['journal_ranking' ] - df ['journal_ranking' ].min ()) / (df ['journal_ranking' ].max () - df ['journal_ranking' ].min ())
193+ df ['pub_year' ] = (df ['pub_year' ] - df ['pub_year' ].min ()) / (df ['pub_year' ].max () - df ['pub_year' ].min ())
194+ # citations logaritmic before normalization
195+ df ['citations' ] = pd .to_numeric (df ['citations' ], errors = 'coerce' ).fillna (0 )
196+ df ['citations' ] = np .log10 (df ['citations' ]+ 1e-6 )
197+ df ['citations' ] = (df ['citations' ] - df ['citations' ].min ()) / (df ['citations' ].max () - df ['citations' ].min ())
198+ df ['score' ]= journal_w * df ['journal_ranking' ]+ year_w * df ['pub_year' ]+ citation_w * df ['citations' ]
199+ df .sort_values (by = 'score' ,ascending = False )
200+ return df
201+
@classmethod
def get_top_n_articles(cls, query: str, n_top_articles=10, db="pmc", **kwargs) -> pd.DataFrame:
    """
    Get the top N articles based on the ranking.

    Args:
        query (str): The search query.
        n_top_articles (int): The number of top articles to retrieve;
            values below 1 yield an empty result.
        db (str): The database to search (default: "pmc").
        **kwargs: Additional keyword arguments for ranking.

    Returns:
        pd.DataFrame: The ``n_top_articles`` rows with the highest
        ``score``, best first.
    """
    metrics = cls.get_articles_metrics(query=query, db=db)
    df = cls.rank_articles(metrics=metrics, **kwargs)
    print(f"{datetime.now()} - Getting top {n_top_articles} articles...")
    if "score" in df.columns:
        # Order by score here so the slice below really is the top N,
        # independent of the row order of the ranked frame.
        df = df.sort_values(by="score", ascending=False, kind="mergesort")
    # Clamp at zero: a negative stop would slice from the wrong end.
    return df.iloc[:max(int(n_top_articles), 0)]
221+
222+ @classmethod
223+ def download_pdf (cls , pmc_id :str , pdf_url :str )-> None :
224+ """
225+ Download a PDF file from a given URL.
226+
227+ Args:
228+ pmc_id (str): The PMC ID of the article.
229+ pdf_url (str): The URL of the PDF file.
230+
231+ Returns:
232+ None
233+ """
54234 response = requests .get (pdf_url , headers = cls .headers , timeout = 30 )
55235 if response .status_code == 200 and response .headers .get ("Content-Type" , "" ).startswith ("application/pdf" ):
56236 file_path = os .path .join (cls .output_folder , f"{ pmc_id } .pdf" )
@@ -61,47 +241,69 @@ def download_pdf(cls, pmc_id, pdf_url):
61241 print (f" Failed to download { pmc_id } (Status: { response .status_code } )" )
62242
@classmethod
def download_pdfs(cls, pmc_ids: List[str]) -> None:
    """
    Download the PDF of every article in ``pmc_ids``, one after another.

    Args:
        pmc_ids (List[str]): Article ids, with or without the "PMC" prefix.

    Returns:
        None
    """
    prefix = "PMC"
    for raw_id in pmc_ids:
        # Normalise the id so it always carries the prefix.
        article_id = raw_id if raw_id.startswith(prefix) else prefix + raw_id
        target = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{article_id}/pdf/"
        try:
            cls.download_pdf(pmc_id=article_id, pdf_url=target)
        except Exception as e:
            # A failed article is reported and skipped; the loop goes on.
            print(f" Error downloading {article_id}: {e}")
        # Pause between requests for the configured class-level interval.
        time.sleep(cls.sleep_time)
74263
@classmethod
def batch_download_pdfs(cls, query: str, retmax=100, db="pmc", n_top_articles=10, **kwargs) -> None:
    """
    Download the PDFs of the best-ranked articles for a given query.

    Args:
        query (str): The search query.
        retmax (int): The maximum number of articles handed to the
            downloader per batch; values below 1 are treated as 1.
        db (str): The database to search (default: "pmc").
        n_top_articles (int): How many of the best-ranked articles to
            download (default: 10).
        **kwargs: Additional keyword arguments for ranking.

    Returns:
        None
    """
    top_n_articles = cls.get_top_n_articles(
        query=query, n_top_articles=n_top_articles, db=db, **kwargs
    )
    print(f"{datetime.now()} - Downloading top {top_n_articles.shape[0]} articles...")
    if top_n_articles.shape[0] == 0:
        return
    # The ranked frame is indexed by article id.
    pmc_ids = top_n_articles.index.tolist()
    # range() rejects a step of 0 and yields nothing for a negative step.
    batch_size = max(int(retmax), 1)
    for start in range(0, len(pmc_ids), batch_size):
        cls.download_pdfs(pmc_ids[start:start + batch_size])
88286
@classmethod
def set_email(cls, email: str) -> None:
    """
    Set the email address for the NCBI Entrez API.

    Args:
        email (str): The email address.

    Returns:
        None

    Raises:
        ValueError: If ``email`` is empty or one of the placeholder
            addresses used as defaults in this script.
    """
    # Both spellings of the placeholder are rejected so the guard fires
    # regardless of which default the caller fell back to.
    placeholders = {'your.email@mail.com', 'your.email@example.com'}
    if not email or email.strip() in placeholders:
        raise ValueError("Please set your email address")
    Entrez.email = email
101301
302+
if __name__ == "__main__":
    # Script entry point: parse the CLI, configure Entrez, reset the output
    # folder and download the best-ranked articles for the query.
    args = PapersDownloader.get_args()
    query = args.query or '(Multiple Myeloma[Title]) AND ("2024/01/01"[Publication Date] : "2025/12/31"[Publication Date])'
    # Fall back to the placeholder that set_email rejects, so a missing
    # --email stops the run instead of sending a fake address to NCBI.
    email = args.email or "your.email@mail.com"
    PapersDownloader.set_email(email)
    PapersDownloader.create_directory()
    # Forward the ranking weights from the command line; they reach
    # rank_articles through the **kwargs pass-through.
    # NOTE(review): --top_n is not forwarded here; forward it only once
    # batch_download_pdfs is confirmed to accept an article-count argument.
    PapersDownloader.batch_download_pdfs(
        query=query,
        journal_w=args.journal_w,
        year_w=args.year_w,
        citation_w=args.citation_w,
    )
0 commit comments