Pubmed_large/parseXML.py at master · kokitsuyuzaki/Pubmed_large · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/python
# -*- coding: utf_8 -*-

import os
from lxml import etree, objectify
import sys
import codecs
import re

# Convert one PubMed/MEDLINE XML file into three tab-separated text files:
#   pubmed.txt : PMID, journal title, year, article title, abstract, PubMed URL
#   pmc.txt    : PMID, PMCID, PMC PDF URL
#   mesh.txt   : PMID, MeSH descriptor name (one row per MeSH heading)
# All three files are opened in append mode, so repeated runs accumulate rows.

# Path of the XML file to parse (first command-line argument).
xml_file = sys.argv[1]

# A PMC identifier at the start of an OtherID value; compiled once, not per
# record.
_PMC_ID_PATTERN = re.compile(r"PMC\d+")

_SKIPPABLE = (IndexError, AttributeError)


def _first_text(node, *tags):
    """Return the text of the element reached by following *tags* from *node*.

    Each tag selects the first matching child of the element found so far.

    Raises:
        IndexError: if a tag along the path has no matching element.
    """
    for tag in tags:
        node = node.xpath(tag)[0]
    return node.text


def _clean(text):
    """Return *text* with tabs and newlines removed so it fits one TSV cell.

    Raises:
        AttributeError: if *text* is None (the element carried no text).
    """
    return text.replace("\t", "").replace("\n", "")


def _write_row(out, fields):
    """Write *fields* to the binary file *out* as one UTF-8 encoded TSV line."""
    out.write((u"\t".join(fields) + u"\n").encode("utf_8"))


# The files are opened in binary append mode because every row is encoded to
# UTF-8 bytes before it is written; that works on Python 2 and Python 3 alike.
# The with-statement closes all three files even if parsing or writing fails.
with open('pubmed.txt', 'ab') as out1, \
        open('pmc.txt', 'ab') as out2, \
        open('mesh.txt', 'ab') as out3:

    # parse XML
    tree = objectify.parse(xml_file, parser=etree.XMLParser())
    root = tree.getroot()

    for record in root:
        # Every output row is keyed by the PMID, so a record without a usable
        # PMID contributes nothing to any of the three files.
        try:
            pmid = _clean(_first_text(record, "PMID"))
        except _SKIPPABLE:
            continue

        # Article metadata -> pubmed.txt.
        # NOTE: only the first AbstractText element of the abstract is used.
        try:
            article_row = [
                pmid,
                _clean(_first_text(record, "Article", "Journal", "Title")),
                _clean(_first_text(record, "Article", "Journal",
                                   "JournalIssue", "PubDate", "Year")),
                _clean(_first_text(record, "Article", "ArticleTitle")),
                _clean(_first_text(record, "Article", "Abstract",
                                   "AbstractText")),
                "http://www.ncbi.nlm.nih.gov/pubmed/" + pmid,
            ]
        except _SKIPPABLE:
            # Deliberate best effort: a record missing any of these fields is
            # left out of pubmed.txt rather than written with blanks.
            pass
        else:
            _write_row(out1, article_row)

        # PMCID / PMC URL -> pmc.txt.
        # Every OtherID is inspected (not only the first one), so a PMC id
        # that follows another identifier is still found. An OtherID with no
        # text is treated as an empty string instead of raising TypeError.
        for other_id in record.xpath("OtherID"):
            match = _PMC_ID_PATTERN.match(other_id.text or "")
            if match:
                pmcid = match.group()
                pmc_url = ("http://www.ncbi.nlm.nih.gov/pmc/articles/"
                           + pmcid + "/pdf/")
                _write_row(out2, [pmid, pmcid, pmc_url])
                break

        # MeSH terms -> mesh.txt, one row per heading of the first
        # MeshHeadingList. The heading list is looked up once per record.
        heading_lists = record.xpath("MeshHeadingList")
        headings = heading_lists[0].xpath("MeshHeading") if heading_lists else []
        for heading in headings:
            try:
                mesh_term = _clean(_first_text(heading, "DescriptorName"))
            except _SKIPPABLE:
                # Skip only the malformed heading; later headings are kept.
                continue
            _write_row(out3, [pmid, mesh_term])