-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathread_csv.py
More file actions
155 lines (140 loc) · 5.73 KB
/
read_csv.py
File metadata and controls
155 lines (140 loc) · 5.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# importing csv module
import csv
import os
import calculations
# Substrings looked for in the CSV header to recognise demographic columns.
_DEMOGRAPHIC_LABELS = (
    'ethnicity', 'gender', 'religion', 'income',
    'age', 'education', 'location', 'sex',
)
# Columns whose cells are parsed as numbers; every other label is tallied
# as a category.
_NUMERIC_LABELS = ('age', 'sex', 'income')
_CATEGORY_LABELS = ('location', 'ethnicity', 'gender', 'religion', 'education')


def _read_csv_file(filepath):
    """Return ``(header, rows)`` read from *filepath*, or ``None``.

    ``None`` is returned when the file cannot be opened, decoded or parsed,
    or when it has no header row at all.
    """
    try:
        # newline='' is what the csv module documents for reader input.
        with open(filepath, 'r', newline='') as csvfile:
            reader = csv.reader(csvfile)
            header = next(reader, None)
            if header is None:
                return None
            return header, list(reader)
    except (OSError, UnicodeDecodeError, csv.Error):
        return None


def _load_rows(path, filename):
    """Locate *filename* under *path* and return ``(header, rows)``.

    ``path + filename`` is tried first (so *path* is expected to end with a
    separator). If that fails, every directory below *path* is searched.
    """
    loaded = _read_csv_file(path + filename)
    if loaded is not None:
        return loaded

    header, rows = [], []
    for dirpath, _dirnames, _filenames in os.walk(path):
        loaded = _read_csv_file(os.path.join(dirpath, filename))
        if loaded is None:
            continue
        # NOTE(review): when several directories contain the file, rows are
        # accumulated and the last header wins, as in the original code.
        header = loaded[0]
        rows.extend(loaded[1])
    return header, rows


def _match_columns(fields):
    """Return ``(column_index, label)`` pairs for recognised header fields.

    Matching is a case-insensitive substring test, so one field can match
    more than one label.
    """
    matches = []
    for index, field in enumerate(fields):
        lowered = field.lower()
        for label in _DEMOGRAPHIC_LABELS:
            if label in lowered:
                matches.append((index, label))
    return matches


def _collect_values(rows, columns):
    """Split matched cells into numeric lists and categorical tallies.

    Returns ``(numeric, tallies)``: *numeric* maps each numeric label to a
    list of floats, *tallies* maps each categorical label to a
    ``{value: occurrences}`` dict.
    """
    numeric = {label: [] for label in _NUMERIC_LABELS}
    tallies = {label: {} for label in _CATEGORY_LABELS}
    for row in rows:
        for index, label in columns:
            if index >= len(row):
                continue  # short row: this column is missing
            value = row[index]
            if label in numeric:
                try:
                    numeric[label].append(float(value))
                except ValueError:
                    # Blank or non-numeric cell: skip it rather than abort
                    # the whole analysis.
                    continue
            elif label == 'location' and "." in value:
                # Location values containing a dot are ignored.
                continue
            else:
                counts = tallies[label]
                counts[value] = counts.get(value, 0) + 1
    return numeric, tallies


def readCSV(path, filename):
    """Summarise the demographic columns of a CSV file.

    The file is read from ``path + filename``; if that cannot be read, the
    directories below *path* are searched for *filename*. Header fields
    containing one of the demographic labels (ethnicity, gender, religion,
    income, age, education, location, sex) are analysed.

    Args:
        path: Directory of the data files, including a trailing separator.
        filename: Name of the CSV file.

    Returns:
        A dict that may contain:
          * "Age" / "Income": list of count, variance and mean strings plus
            the value returned by ``calculations.calcHistogram`` (only when
            more than one numeric value was found).
          * "Sex": as above, preceded by a breakdown string and followed by
            recommendation strings when at most two distinct values exist.
          * "Ethnicity" / "Gender": ``{value: occurrences}`` dicts; the
            Gender dict gets two extra recommendation entries when exactly
            two genders occur.
        An empty dict is returned when no file or no matching column is
        found. Location, religion and education are tallied but not yet
        reported.
    """
    fields, rows = _load_rows(path, filename)

    columns = _match_columns(fields)
    print([index for index, _ in columns])
    print([label for _, label in columns])

    numeric, tallies = _collect_values(rows, columns)
    age = numeric['age']
    sex = numeric['sex']
    income = numeric['income']
    Ethnicity = tallies['ethnicity']
    Gender = tallies['gender']

    labelDict = {}  # dict to return to runTerminalCommands
    # TODO : do analysis on string values
    if len(age) > 1:
        ageVar = calculations.calcVariance(age)
        ageMean = calculations.calcMean(age)
        ageHist = calculations.calcHistogram(age, "Age")
        labelDict["Age"] = [
            "Count: " + str(len(age)),
            "Variance: " + str(ageVar),
            "Mean: " + str(ageMean),
            ageHist,
        ]
    if len(sex) > 1:
        # presumably calcBreakDown returns a str; it is concatenated as-is
        sexCount = calculations.calcBreakDown(sex)
        sexVar = calculations.calcVariance(sex)
        sexMean = calculations.calcMean(sex)
        uniqueValues = calculations.calcUniqueValues(sex)
        sexHist = calculations.calcHistogram(sex, "Sex")
        sexReport = [
            "Breakdown: " + sexCount,
            "Count: " + str(len(sex)),
            "Variance: " + str(sexVar),
            "Mean: " + str(sexMean),
            sexHist,
        ]
        if uniqueValues <= 2:  # only two or fewer distinct sex values
            sexReport.append(
                "Recommendations: " + "You only included "
                + str(uniqueValues) + " Sex data points."
            )
            sexReport.append(
                "If you did not include Intersex or Transgender people, consider how this might impact your results."
            )
        labelDict["Sex"] = sexReport
    if len(income) > 1:
        incomeVar = calculations.calcVariance(income)
        incomeMean = calculations.calcMean(income)
        incomeHist = calculations.calcHistogram(income, "Income")
        labelDict["Income"] = [
            "Count: " + str(len(income)),
            "Variance: " + str(incomeVar),
            "Mean: " + str(incomeMean),
            incomeHist,
        ]
    if len(Ethnicity) > 0:
        labelDict['Ethnicity'] = Ethnicity
    if len(Gender) > 0:
        if len(Gender) == 2:
            Gender.update({'Recommendations: ': "You only have two genders in your data.", '-': "Consider how not including other genders might bias your results and lead to erasure."})
        labelDict['Gender'] = Gender
    return labelDict