Skip to content

Commit 52657f4

Browse files
authored
Merge pull request #81 from aryan1165/aryan
Added code to scrape records from PubMed
2 parents 76f603d + 6c5fa1d commit 52657f4

File tree

1 file changed

+96
-0
lines changed

1 file changed

+96
-0
lines changed

pubmed/pubmed_scrape.py

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
from Bio import Entrez
2+
3+
def fetch_pubmed_records(email, search_term, max_records=100):
    """
    Fetch PubMed records matching the search term.

    Parameters:
        email (str): Email address for NCBI.
        search_term (str): The term to search for in PubMed.
        max_records (int): Maximum number of records to fetch.

    Returns:
        list: List of PubMed IDs (PMIDs).
    """
    # NCBI requires a contact email on every E-utilities request.
    Entrez.email = email
    search_handle = Entrez.esearch(db="pubmed", term=search_term, retmax=max_records)
    search_result = Entrez.read(search_handle)
    search_handle.close()
    return search_result["IdList"]
21+
22+
def fetch_pubmed_details(id_list):
    """
    Fetch detailed information for a list of PubMed IDs.

    Parameters:
        id_list (list): List of PubMed IDs.

    Returns:
        dict: Parsed XML data with detailed information for each PubMed ID
              (keys 'PubmedArticle' and 'PubmedBookArticle').
    """
    # Guard: Entrez.efetch errors out on an empty id string, so short-circuit
    # with an empty result that matches the shape of a normal efetch parse.
    if not id_list:
        return {"PubmedArticle": [], "PubmedBookArticle": []}

    ids = ",".join(id_list)
    handle = Entrez.efetch(db="pubmed", id=ids, rettype="xml", retmode="xml")
    try:
        data = Entrez.read(handle)  # Read the data using Entrez.read
    finally:
        # Close the handle even if parsing raises, so the connection is not leaked.
        handle.close()
    return data
37+
38+
def extract_pubmed_info(records):
    """
    Extract required information from PubMed records.

    Parameters:
        records (dict): Parsed XML data from PubMed.

    Returns:
        list: List of dictionaries with extracted information
              (Title, Abstract, PublicationDate, Authors, Journal, PMID).
    """
    extracted_data = []
    # Use .get so a result set with no 'PubmedArticle' key (e.g. an empty
    # search, or book-only results) yields [] instead of raising KeyError.
    for record in records.get('PubmedArticle', []):
        medline_citation = record.get('MedlineCitation', {})
        article = medline_citation.get('Article', {})
        journal = article.get('Journal', {})
        journal_issue = journal.get('JournalIssue', {})
        pub_date = journal_issue.get('PubDate', {})

        # Extract article information
        article_info = {
            "Title": article.get("ArticleTitle", ""),
            # AbstractText is a list of paragraph elements; coerce each to str
            # before joining in case the parser returns typed string subclasses.
            "Abstract": ' '.join(
                str(part) for part in article.get("Abstract", {}).get("AbstractText", [])
            ),
            "PublicationDate": pub_date.get("Year", ""),
            # Collective/corporate authors may lack LastName/ForeName; fall back to "".
            "Authors": [
                f"{author.get('LastName', '')} {author.get('ForeName', '')}"
                for author in article.get("AuthorList", [])
            ],
            "Journal": journal.get("Title", ""),
            "PMID": medline_citation.get("PMID", ""),
        }
        extracted_data.append(article_info)
    return extracted_data
67+
68+
def main():
    """
    Main function to fetch, parse, and print PubMed data.
    """
    email = "[email protected]"  # Replace with your email
    search_term = "COVID-19"  # Replace with your search term
    max_records = 100  # Adjust the number of records to fetch

    print("Fetching PubMed records...")
    id_list = fetch_pubmed_records(email, search_term, max_records)

    print(f"Found {len(id_list)} records. Fetching details...")
    records = fetch_pubmed_details(id_list)

    print("Extracting information from records...")
    extracted_data = extract_pubmed_info(records)

    # Print or save the extracted data
    for entry in extracted_data:
        # Assemble one record's report, then emit it in a single write.
        # The trailing "\n" element reproduces the blank-line separator.
        report = [
            f"PMID: {entry['PMID']}",
            f"Title: {entry['Title']}",
            f"Abstract: {entry['Abstract']}",
            f"Publication Date: {entry['PublicationDate']}",
            f"Authors: {', '.join(entry['Authors'])}",
            f"Journal: {entry['Journal']}",
            "\n",
        ]
        print("\n".join(report))

0 commit comments

Comments
 (0)