# DatabaseWordCounter/otherDataFromStudies.py at alzheimer · Marc-Morcos/DatabaseWordCounter · GitHub
import xml.etree.ElementTree as ET
import os
import pandas as pd
import numpy as np
import json

# extract attribute where we expect a single answer (or 0)
def getSingularAttribute(attribute,root):
    """Return the text of the single element matched by ``root.findall(attribute)``.

    Args:
        attribute: tag name / path expression handed to ``findall``.
        root: the ElementTree element to search.

    Returns:
        The ``.text`` of the one matching element (``None`` if that element
        carries no text), or the string "Not Specified" when nothing matches.

    Raises:
        AssertionError: if more than one element matches.
    """
    matches = root.findall(attribute)
    if not matches:
        return "Not Specified"
    # Raised explicitly instead of via an ``assert`` statement so the check
    # is not stripped when Python runs with -O; the exception type is the same.
    if len(matches) != 1:
        raise AssertionError(
            "expected at most one <%s> element, found %d" % (attribute, len(matches))
        )
    return matches[0].text

#count value in category
def countValue(category,value, outputDict):
    """Add one occurrence of ``value`` to the tally kept under ``category``.

    ``outputDict`` is modified in place: it maps each category to a dict of
    ``value -> count``. Missing categories and values are created on first
    use. Nothing is returned.
    """
    tally = outputDict.setdefault(category, {})
    tally[value] = tally.get(value, 0) + 1


def _yearBucket(startDate):
    """Map a "Start Date" cell to the name of its results bucket.

    The last four characters of the cell are read as the year.

    Returns:
        "[1995,2020)" or "2020AndBeyond", or None when the study must be
        skipped (the cell is not a string, or the year is before 1995).
    """
    if not isinstance(startDate, str):
        return None #filter out studies with no start date
    year = startDate[-4:]
    assert year.isnumeric(), "Start Date does not end in a year: " + repr(startDate)
    if int(year) < 1995:
        return None #filter out studies before 1995
    if int(year) < 2020:
        return "[1995,2020)"
    return "2020AndBeyond"


def _classifyCountries(root):
    """Classify a study as "USA", "USA+Other", "Other" or "Not Specified".

    Country names are read from every ``country`` child of every
    ``location_countries`` child of ``root`` and compared case-insensitively
    with "united states". A ``country`` element without text is ignored, and
    a study that yields no country name at all is "Not Specified" (not
    "Other"), since no non-US country was actually listed.
    """
    names = []
    for group in root.findall('location_countries'):
        for country in group.findall("country"):
            if country.text:
                names.append(country.text.lower())
    if not names:
        return "Not Specified"
    hasUS = any(name == "united states" for name in names)
    hasOther = any(name != "united states" for name in names)
    if hasUS:
        return "USA+Other" if hasOther else "USA"
    return "Other"


def _tallyStudy(root, bucket):
    """Record one study's attributes in ``bucket`` (one entry of ``results``).

    Counts country class, gender and the four study-design fields via
    ``countValue``, and appends the enrollment number (when present) to
    ``bucket["enrollmentNotIncludingNonSpecified"]``.
    """
    #location country
    countValue("Country", _classifyCountries(root), bucket)

    #gender
    eligibility = root.findall("eligibility")
    assert len(eligibility) == 1, "expected exactly one <eligibility> element"
    countValue("gender", getSingularAttribute('gender', eligibility[0]), bucket)

    #enrollment
    enrollment = getSingularAttribute("enrollment", root)
    if enrollment != "Not Specified":
        assert enrollment.isnumeric(), "non-numeric enrollment: " + repr(enrollment)
        bucket["enrollmentNotIncludingNonSpecified"].append(int(enrollment))

    #study design: every field is counted as "Not Specified" when the
    #study has no study_design_info element at all
    designs = root.findall("study_design_info")
    assert len(designs) <= 1, "expected at most one <study_design_info> element"
    design = designs[0] if designs else None
    for field in ("allocation", "intervention_model", "masking", "primary_purpose"):
        if design is None:
            value = "Not Specified"
        else:
            value = getSingularAttribute(field, design)
        if field == "masking" and value != "Not Specified":
            value = value.split()[0] #get first word
        countValue(field, value, bucket)


def _enrollmentStats(values):
    """Return {"MEAN", "STDDEV"} for ``values``; both are None for an empty list.

    The empty case is handled explicitly because ``np.mean([])`` returns nan
    with a RuntimeWarning, and ``json.dumps`` would then write the
    non-standard token ``NaN`` into the report.
    """
    if not values:
        return {"MEAN": None, "STDDEV": None}
    return {"MEAN": np.mean(values), "STDDEV": np.std(values)}


def main():
    """Summarise study attributes per start-year bucket into Output/otherData.txt.

    Reads ./SearchResults_1.csv (columns "Start Date", "Status" and
    "NCT Number"), keeps the rows whose start year is 1995 or later and whose
    status is in ``statusesWeWant``, parses search_results/<NCT Number>.xml
    for each kept row, and writes the tallies plus enrollment mean/stddev as
    text followed by a JSON dump. All paths are relative to this file's
    directory, which becomes the working directory.
    """
    #change directory to current file path
    os.chdir(os.path.dirname(os.path.abspath(__file__)))

    #read original csv
    toGet = []
    statusesWeWant = ["Recruiting","Enrolling by invitation","Active, not recruiting","Completed"]
    statusesExcluded = []
    inputFile = os.path.abspath("./SearchResults_1.csv")
    inputDF = pd.read_csv(inputFile, engine='python').dropna(how="all")
    for _, row in inputDF.iterrows():
        year = _yearBucket(row["Start Date"])
        if year is None:
            continue

        status = row["Status"]
        NCTNum = row["NCT Number"]

        #exclude statuses we dont want (remembering each one once, for the report)
        if status not in statusesWeWant:
            if status not in statusesExcluded:
                statusesExcluded.append(status)
            continue

        #prepare input
        toGet.append({"year":year,
                      "status":status,
                      "NCTNum":NCTNum})

    results = {
        "[1995,2020)": {
            "enrollmentNotIncludingNonSpecified": []
            },
        "2020AndBeyond": {
            "enrollmentNotIncludingNonSpecified": []
            }
    }
    #read all the xml files and get stats
    for item in toGet:
        filepath = os.path.join(os.getcwd(),"search_results",item["NCTNum"]+".xml")
        root = ET.parse(filepath).getroot()
        _tallyStudy(root, results[item["year"]])

    #tally enrollment stats (replaces each raw list with its summary)
    for bucket in results.values():
        bucket["enrollmentNotIncludingNonSpecified"] = _enrollmentStats(
            bucket["enrollmentNotIncludingNonSpecified"])

    outputDir = os.path.abspath("./Output")
    os.makedirs(outputDir, exist_ok = True)
    reportLines = [
        "Results",
        "\nTotal Studies"+str(len(inputDF)),
        "\nSections we want:"+str(statusesWeWant),
        "\nNum studies after applying filters:"+str(len(toGet)),
        "\nSections we dont want"+str(statusesExcluded),
        "\n"+json.dumps(results, indent=4),
    ]
    #explicit encoding so the report does not depend on the platform default
    with open(os.path.join(outputDir,"otherData.txt"), 'w', encoding="utf-8") as f:
        f.writelines(reportLines)


# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()