datasetbucket/read_csv.py at main · kescardoso/datasetbucket · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# importing csv module
import csv

import os


import calculations

# Labels searched for (as substrings) in the CSV header, in reporting order.
_DEMOGRAPHIC_LABELS = ('ethnicity', 'gender', 'religion', 'income', 'age', 'education', 'location', 'sex')
# Labels whose cells are parsed as numbers; every other label is tallied per distinct value.
_NUMERIC_LABELS = ('age', 'sex', 'income')


def _loadCSV(filePath):
	"""Read one CSV file; return ``(header, dataRows)`` or ``None`` if it cannot be read.

	The file is read completely before anything is returned, so a failure
	part-way through never leaks a partial set of rows to the caller.
	"""
	try:
		# newline='' is what the csv module asks for so quoted newlines survive.
		with open(filePath, 'r', newline='') as csvfile:
			csvreader = csv.reader(csvfile)
			header = next(csvreader)  # StopIteration on an empty file
			return header, list(csvreader)
	except (OSError, StopIteration, UnicodeDecodeError, csv.Error):
		return None


def readCSV(path, filename):
	"""Summarise the demographic columns of a CSV file.

	The file is first looked up at ``path + filename``. If that cannot be read,
	every directory under ``path`` (including ``path`` itself) is searched for
	``filename``; rows of all readable matches are combined and the header of
	the last match is used.

	A column is analysed when its header contains one of the labels in
	``_DEMOGRAPHIC_LABELS`` (case-sensitive substring match).

	Args:
		path: Directory prefix; concatenated directly with ``filename``, so it
			should end with a path separator for the direct lookup to succeed.
		filename: Name of the CSV file.

	Returns:
		A dict that may contain the keys:
		  * ``"Age"``, ``"Sex"``, ``"Income"`` -- lists of summary entries built
		    from the ``calculations`` helpers; only present when at least two
		    numeric values were found for that label.
		  * ``"Ethnicity"``, ``"Gender"`` -- dicts mapping each distinct cell
		    value to its number of occurrences. When exactly two genders are
		    present, two recommendation entries are added to the Gender dict.
		Location, religion and education values are tallied but not returned
		yet. An empty dict is returned when no file or no matching column is
		found.
	"""
	fields = []
	rows = []

	loaded = _loadCSV(path + filename)
	if loaded is not None:
		fields, rows = loaded
	else:
		# If there is a folder in dataFiles, walk through that folder too.
		for dirPath, _dirNames, _fileNames in os.walk(path):
			found = _loadCSV(os.path.join(dirPath, filename))
			if found is not None:
				fields = found[0]
				rows.extend(found[1])

	# Column position and matched label of every header cell we can analyse.
	# NOTE(review): matching is case-sensitive, so a header such as "Age" is
	# not picked up -- confirm whether that is intended.
	validRowsIndex = []
	validRowsName = []
	for column, field in enumerate(fields):
		for label in _DEMOGRAPHIC_LABELS:
			if label in field:
				validRowsIndex.append(column)
				validRowsName.append(label)
	print(validRowsIndex)
	print(validRowsName)

	numbers = {label: [] for label in _NUMERIC_LABELS}
	tallies = {label: {} for label in _DEMOGRAPHIC_LABELS if label not in _NUMERIC_LABELS}

	for row in rows:
		# Pair each column position with its own label; indexing the two lists
		# by the column number itself mismatches or overruns them.
		for column, label in zip(validRowsIndex, validRowsName):
			if len(row) <= column:
				continue  # short row: this cell is missing
			value = row[column]
			if label in numbers:
				try:
					numbers[label].append(float(value))
				except ValueError:
					# Blank or non-numeric cell: leave it out of the statistics
					# instead of aborting the whole analysis.
					continue
			elif label == 'location' and "." in value:
				continue  # locations containing "." are not counted
			else:
				counts = tallies[label]
				counts[value] = counts.get(value, 0) + 1

	age = numbers['age']
	sex = numbers['sex']
	income = numbers['income']
	Ethnicity = tallies['ethnicity']
	Gender = tallies['gender']

	labelDict = {}  # dict to return to runTerminalCommands
	# TODO : do analysis on string values
	if len(age) > 1:
		ageVar = calculations.calcVariance(age)
		ageMean = calculations.calcMean(age)
		ageHist = calculations.calcHistogram(age, "Age")
		labelDict["Age"] = ["Count: " + str(len(age)), "Variance: " + str(ageVar), "Mean: " + str(ageMean), ageHist]
	if len(sex) > 1:
		sexCount = calculations.calcBreakDown(sex)
		sexVar = calculations.calcVariance(sex)
		sexMean = calculations.calcMean(sex)
		uniqueValues = calculations.calcUniqueValues(sex)
		sexHist = calculations.calcHistogram(sex, "Sex")
		sexSummary = ["Breakdown: " + sexCount, "Count: " + str(len(sex)), "Variance: " + str(sexVar), "Mean: " + str(sexMean), sexHist]
		if uniqueValues <= 2:  # check if only included 2 or less sex data points
			sexSummary += ["Recommendations: " + "You only included " + str(uniqueValues) + " Sex data points.", "If you did not include Intersex or Transgender people, consider how this might impact your results."]
		labelDict["Sex"] = sexSummary
	if len(income) > 1:
		incomeVar = calculations.calcVariance(income)
		incomeMean = calculations.calcMean(income)
		incomeHist = calculations.calcHistogram(income, "Income")
		labelDict["Income"] = ["Count: " + str(len(income)), "Variance: " + str(incomeVar), "Mean: " + str(incomeMean), incomeHist]
	if len(Ethnicity) > 0:
		labelDict['Ethnicity'] = Ethnicity
	if len(Gender) > 0:
		if len(Gender) == 2:
			Gender.update({'Recommendations: ': "You only have two genders in your data.", '-': "Consider how not including other genders might bias your results and lead to erasure."})
		labelDict['Gender'] = Gender

	return labelDict