Skip to content

Download and preprocess Pubmed Open Access dataset

Titipat Achakulvisut edited this page May 10, 2019 · 1 revision

Download Pubmed Open-Access subset

wget ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/*.xml.tar.gz

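Untar each downloaded archive into the data folder (the folder must already exist; create it first with `mkdir -p data`), for example: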

tar -xzf comm_use.A-B.xml.tar.gz --directory data/

Run the parser in PySpark as follows (see the full script [WIP] here):

import os
import pubmed_parser as pp
from pyspark.sql import Row, SparkSession

# Reuse the active SparkSession (e.g. the one the pyspark shell provides),
# or start a new one when the script is run standalone.
spark = SparkSession.builder.getOrCreate()

# Paths of the extracted XML files under data/ (as listed by pubmed_parser).
path_all = pp.list_xml_path('data/')

# Distribute the paths so every file is parsed on the executors. The original
# snippet used `pubmed_oa_rdd` without ever defining it.
pubmed_oa_rdd = spark.sparkContext.parallelize(path_all)

# One Row per article: every key of the parser's result plus the file name.
parse_results_rdd = pubmed_oa_rdd.map(
    lambda x: Row(file_name=os.path.basename(x), **pp.parse_pubmed_xml(x)))
pubmed_oa_df = parse_results_rdd.toDF()

# Keep only the article-level columns; authors and affiliations are written
# to their own tables by the scripts further down.
pubmed_oa_df_sel = pubmed_oa_df[['full_title', 'abstract', 'doi',
                                 'file_name', 'pmc', 'pmid',
                                 'publication_year', 'publisher_id', 'journal', 'subjects']]
pubmed_oa_df_sel.write.parquet('pubmed_oa.parquet')

Here is the script to parse only the authors into a separate dataframe

def parse_name(p):
    """Expand one parsed article into one Row per author.

    `p` must expose `pmc`, `pmid` and `author_list`. Each entry of
    `author_list` is read positionally: index 0 is the last name, index 1
    the first name and index 2 the affiliation id.

    Returns a list of Rows, or None when the article has no authors.
    """
    authors = p.author_list
    if len(authors) < 1:
        return None
    return [
        Row(pmc=p.pmc, pmid=p.pmid, last_name=entry[0],
            first_name=entry[1], affiliation_id=entry[2])
        for entry in authors
    ]

# Build the author table: one list of Rows per article, drop articles that
# have no authors (parse_name returns None for them), then flatten the
# per-article lists into individual rows.
author_rows_per_article = parse_results_rdd.map(parse_name)
non_empty_author_rows = author_rows_per_article.filter(
    lambda rows: rows is not None)
parse_name_rdd = non_empty_author_rows.flatMap(lambda rows: list(rows))

parse_name_df = parse_name_rdd.toDF()
parse_name_df.write.parquet('pubmed_oa_author.parquet')

And here is the script to parse the affiliations into a separate dataframe

def parse_affiliation(p):
    """Expand one parsed article into one Row per affiliation.

    `p` must expose `pmc`, `pmid` and `affiliation_list`. Each entry of
    `affiliation_list` is read positionally: index 0 is the affiliation id
    and index 1 the affiliation text.

    Returns a list of Rows, or None when the article has no affiliations.
    """
    affiliations = p.affiliation_list
    if len(affiliations) < 1:
        return None
    return [
        Row(pmc=p.pmc, pmid=p.pmid,
            affiliation_id=entry[0], affiliation=entry[1])
        for entry in affiliations
    ]

# Build the affiliation table: one list of Rows per article, drop articles
# without affiliations (parse_affiliation returns None for them), then
# flatten the per-article lists into individual rows.
parse_affil_rdd = parse_results_rdd.map(parse_affiliation).\
    filter(lambda x: x is not None).\
    flatMap(lambda xs: list(xs))

parse_affil_df = parse_affil_rdd.toDF()
# Write the affiliation dataframe. The original wrote parse_name_df here,
# which stored the author table under the affiliation file name.
parse_affil_df.write.parquet('pubmed_oa_affiliation.parquet')

Clone this wiki locally