-
Notifications
You must be signed in to change notification settings - Fork 178
Download and preprocess Pubmed Open Access dataset
Titipat Achakulvisut edited this page May 10, 2019
·
1 revision
Download Pubmed Open-Access subset
wget ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/*.xml.tar.gz
Untar the file into the data folder:
tar -xzf comm_use.A-B.xml.tar.gz --directory data/
Run the parser in PySpark as follows (see full script [WIP] here):
import os
import pubmed_parser as pp
from pyspark.sql import Row
# List the XML files under data/ (presumably every extracted article file;
# see pubmed_parser's list_xml_path for the exact matching rules).
path_all = pp.list_xml_path('data/')
# Distribute the file paths so they can be parsed in parallel. `sc` is the
# SparkContext that the PySpark shell provides; create one explicitly when
# running this as a standalone script.
pubmed_oa_rdd = sc.parallelize(path_all)
# Parse each file into a Row, keeping the file name next to the parsed fields.
parse_results_rdd = pubmed_oa_rdd.map(
    lambda x: Row(file_name=os.path.basename(x), **pp.parse_pubmed_xml(x))
)
pubmed_oa_df = parse_results_rdd.toDF()
# Select the columns that are written to the main table.
pubmed_oa_df_sel = pubmed_oa_df[['full_title', 'abstract', 'doi',
                                 'file_name', 'pmc', 'pmid',
                                 'publication_year', 'publisher_id', 'journal', 'subjects']]
pubmed_oa_df_sel.write.parquet('pubmed_oa.parquet')
Here is the script to parse only the authors into a separate dataframe:
def parse_name(p):
    """Build one author Row per entry of ``p.author_list``.

    Each entry is read positionally: index 0 becomes ``last_name``, index 1
    becomes ``first_name`` and index 2 becomes ``affiliation_id``. Every Row
    also carries the ``pmc`` and ``pmid`` of ``p``.

    Returns the list of Rows, or ``None`` when ``p.author_list`` is empty.
    """
    authors = p.author_list
    if len(authors) < 1:
        return None
    return [
        Row(pmc=p.pmc, pmid=p.pmid, last_name=entry[0],
            first_name=entry[1], affiliation_id=entry[2])
        for entry in authors
    ]
# Build the author Rows per article, drop articles for which parse_name
# returned None, then flatten the per-article lists into one RDD of Rows.
parse_name_rdd = (
    parse_results_rdd
    .map(parse_name)
    .filter(lambda rows: rows is not None)
    .flatMap(lambda rows: list(rows))
)
parse_name_df = parse_name_rdd.toDF()
parse_name_df.write.parquet('pubmed_oa_author.parquet')
And here is the script to parse the affiliations into a dataframe:
def parse_affiliation(p):
    """Build one affiliation Row per entry of ``p.affiliation_list``.

    Each entry is read positionally: index 0 becomes ``affiliation_id`` and
    index 1 becomes ``affiliation``. Every Row also carries the ``pmc`` and
    ``pmid`` of ``p``.

    Returns the list of Rows, or ``None`` when ``p.affiliation_list`` is
    empty.
    """
    affiliations = p.affiliation_list
    if len(affiliations) < 1:
        return None
    return [
        Row(pmc=p.pmc, pmid=p.pmid,
            affiliation_id=entry[0], affiliation=entry[1])
        for entry in affiliations
    ]
# Build the affiliation Rows per article, drop articles for which
# parse_affiliation returned None, then flatten into one RDD of Rows.
parse_affil_rdd = (
    parse_results_rdd
    .map(lambda x: parse_affiliation(x))
    .filter(lambda x: x is not None)
    .flatMap(lambda xs: [x for x in xs])
)
parse_affil_df = parse_affil_rdd.toDF()
# Write the affiliation dataframe (parse_affil_df), not the author dataframe,
# to the affiliation parquet file.
parse_affil_df.write.parquet('pubmed_oa_affiliation.parquet')