-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathotherDataFromStudies.py
More file actions
177 lines (149 loc) · 5.97 KB
/
otherDataFromStudies.py
File metadata and controls
177 lines (149 loc) · 5.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import xml.etree.ElementTree as ET
import os
import pandas as pd
import numpy as np
import json
# extract attribute where we expect a single answer (or 0)
def getSingularAttribute(attribute, root):
    """Return the text of the single ``attribute`` element found under ``root``.

    Args:
        attribute: Tag name (or ElementTree path) handed to ``root.findall``.
        root: Element whose matches are inspected.

    Returns:
        The ``text`` of the one matching element (``None`` when that element
        has no text), or the string ``"Not Specified"`` when nothing matches.

    Raises:
        AssertionError: If more than one element matches. It is raised
            explicitly rather than through ``assert`` so the check still runs
            under ``python -O``; the exception type is the one the previous
            ``assert`` produced.
    """
    matches = root.findall(attribute)
    if not matches:
        return "Not Specified"
    if len(matches) != 1:
        raise AssertionError(
            "expected at most one {!r} element, found {}".format(
                attribute, len(matches)))
    return matches[0].text
#count value in category
def countValue(category, value, outputDict):
    """Add one to the tally of ``value`` under ``category`` in ``outputDict``.

    Args:
        category: Key of the per-category tally inside ``outputDict``; an
            empty tally dict is created the first time a category is seen.
        value: The observed value to count.
        outputDict: Dict mapping category -> {value: count}; mutated in place.

    Returns:
        None.
    """
    tally = outputDict.setdefault(category, {})
    tally[value] = tally.get(value, 0) + 1
def _year_bucket(start_date):
    """Return the results bucket for a "Start Date" cell, or None to skip.

    The last four characters of the cell are taken as the year. None is
    returned when the cell is not a string (no start date recorded) or when
    the year is before 1995.

    Raises:
        ValueError: If the last four characters are not numeric.
    """
    if not isinstance(start_date, str):
        return None
    year_text = start_date[-4:]
    if not year_text.isnumeric():
        raise ValueError(
            "Start Date {!r} does not end in a numeric year".format(start_date))
    year = int(year_text)
    if year < 1995:
        return None
    if year < 2020:
        return "[1995,2020)"
    return "2020AndBeyond"


def _country_label(root):
    """Classify a study as "USA", "USA+Other", "Other" or "Not Specified".

    Country names are read from every ``location_countries/country`` element
    and compared case-insensitively against "united states". A study with no
    usable country name is "Not Specified" (this includes a
    ``location_countries`` element that lists no countries, which must not be
    reported as "Other").
    """
    names = []
    for group in root.findall("location_countries"):
        for country in group.findall("country"):
            # Element.text is None for an empty element; skip instead of
            # crashing on .lower().
            if country.text:
                names.append(country.text.lower())
    if not names:
        return "Not Specified"
    has_us = "united states" in names
    has_other = any(name != "united states" for name in names)
    if has_us:
        return "USA+Other" if has_other else "USA"
    return "Other"


def _tally_study_design(root, bucket, nct_num):
    """Count allocation, intervention_model, masking and primary_purpose.

    Values come from the study's single ``study_design_info`` element; when
    the element is absent each category is counted as "Not Specified". Only
    the first word of the masking text is kept.

    Raises:
        ValueError: If the study has more than one ``study_design_info``.
    """
    categories = ("allocation", "intervention_model", "masking", "primary_purpose")
    designs = root.findall("study_design_info")
    if len(designs) > 1:
        raise ValueError(
            "{}: expected at most one <study_design_info>, found {}".format(
                nct_num, len(designs)))
    if not designs:
        for category in categories:
            countValue(category, "Not Specified", bucket)
        return
    design = designs[0]
    for category in categories:
        value = getSingularAttribute(category, design)
        if category == "masking" and value != "Not Specified":
            # get first word
            value = value.split()[0]
        countValue(category, value, bucket)


def main():
    """Summarise study metadata per start-year bucket into Output/otherData.txt.

    Steps:
      1. chdir to this file's directory so the relative paths below resolve.
      2. Read ``SearchResults_1.csv`` and keep rows that have a start year of
         1995 or later and a status listed in ``statusesWeWant``.
      3. For each kept study, parse ``search_results/<NCT Number>.xml`` and
         tally country, gender, enrollment and study-design categories into
         the "[1995,2020)" or "2020AndBeyond" bucket.
      4. Replace each bucket's enrollment list with its mean and standard
         deviation and write the report.

    Raises:
        ValueError: If a row or XML file violates the structure the script
            expects (non-numeric year or enrollment, wrong number of
            ``eligibility`` / ``study_design_info`` elements). These checks
            used to be ``assert`` statements, which are skipped under
            ``python -O``.
    """
    # change directory to current file path
    script_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(script_dir)

    # read original csv
    toGet = []
    statusesWeWant = ["Recruiting", "Enrolling by invitation", "Active, not recruiting", "Completed"]
    statusesExcluded = []
    inputFile = os.path.abspath("./SearchResults_1.csv")
    inputDF = pd.read_csv(inputFile, engine='python').dropna(how="all")
    for _, row in inputDF.iterrows():
        # The year filter runs first, so statusesExcluded only records
        # statuses of studies that passed it.
        year = _year_bucket(row["Start Date"])
        if year is None:
            continue
        status = row["Status"]
        NCTNum = row["NCT Number"]
        # exclude statuses we dont want
        if status not in statusesWeWant:
            if status not in statusesExcluded:
                statusesExcluded.append(status)
            continue
        # prepare input
        toGet.append({"year": year,
                      "status": status,
                      "NCTNum": NCTNum})

    results = {
        "[1995,2020)": {
            "enrollmentNotIncludingNonSpecified": []
        },
        "2020AndBeyond": {
            "enrollmentNotIncludingNonSpecified": []
        }
    }

    # read all the xml files and get stats
    xml_dir = os.path.join(os.getcwd(), "search_results")
    for item in toGet:
        nct_num = item["NCTNum"]
        root = ET.parse(os.path.join(xml_dir, nct_num + ".xml")).getroot()
        bucket = results[item["year"]]

        # location country
        countValue("Country", _country_label(root), bucket)

        # gender
        eligibility = root.findall("eligibility")
        if len(eligibility) != 1:
            raise ValueError(
                "{}: expected exactly one <eligibility>, found {}".format(
                    nct_num, len(eligibility)))
        countValue("gender", getSingularAttribute('gender', eligibility[0]), bucket)

        # enrollment (studies without an enrollment element are left out of
        # the mean / standard deviation)
        enrollment = getSingularAttribute("enrollment", root)
        if enrollment != "Not Specified":
            if not (enrollment and enrollment.isnumeric()):
                raise ValueError(
                    "{}: enrollment {!r} is not numeric".format(nct_num, enrollment))
            bucket["enrollmentNotIncludingNonSpecified"].append(int(enrollment))

        # allocation, intervention_model, masking, primary_purpose
        _tally_study_design(root, bucket, nct_num)

    # tally enrollment stats
    # NOTE: a bucket with no enrollments yields NaN from numpy (with a
    # RuntimeWarning), which json.dumps writes as NaN.
    for bucket in results.values():
        enrollments = bucket["enrollmentNotIncludingNonSpecified"]
        bucket["enrollmentNotIncludingNonSpecified"] = {
            "MEAN": np.mean(enrollments),
            "STDDEV": np.std(enrollments)
        }

    outputDir = os.path.abspath("./Output")
    os.makedirs(outputDir, exist_ok=True)
    # Report labels are kept exactly as before so existing output stays comparable.
    report = [
        "Results",
        "\nTotal Studies" + str(len(inputDF)),
        "\nSections we want:" + str(statusesWeWant),
        "\nNum studies after applying filters:" + str(len(toGet)),
        "\nSections we dont want" + str(statusesExcluded),
        "\n" + json.dumps(results, indent=4),
    ]
    with open(os.path.join(outputDir, "otherData.txt"), 'w', encoding="utf-8") as f:
        f.writelines(report)
# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()