MarsCraterData-C2-ANOVA-Analysis/MarsCraterData-ANOVA at master · QuantumArchive/MarsCraterData-C2-ANOVA-Analysis · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 03 13:34:38 2016

@author: Chris
"""

import pandas
import numpy
import seaborn
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multi
#from IPython.display import display
#%matplotlib inline

# Print floats with plain '%f' formatting (original author's note: this works
# around display-format run time errors).
pandas.set_option('display.float_format', lambda x: '%f' % x)

# Show every column and every row when a DataFrame is printed.
pandas.set_option('display.max_columns', None)
pandas.set_option('display.max_rows', None)

# data is the data frame containing the Mars crater data.
data = pandas.read_csv('D:\\Coursera\\marscrater_pds.csv', low_memory=False)

# Coerce latitude and diameter to numeric; values that cannot be parsed become
# NaN. pandas.to_numeric replaces Series.convert_objects(convert_numeric=True),
# which no longer exists in current pandas releases.
data['LATITUDE_CIRCLE_IMAGE'] = pandas.to_numeric(data['LATITUDE_CIRCLE_IMAGE'], errors='coerce')
data['DIAM_CIRCLE_IMAGE'] = pandas.to_numeric(data['DIAM_CIRCLE_IMAGE'], errors='coerce')

# Any crater with no designated morphology (a single blank) is replaced with
# NaN. This is done BEFORE the categorical cast so that the blank never becomes
# a category and so that replace() is not called on a categorical column.
# numpy.nan is used because the numpy.NaN alias was removed in NumPy 2.0.
data['MORPHOLOGY_EJECTA_1'] = data['MORPHOLOGY_EJECTA_1'].replace(' ', numpy.nan)
data['MORPHOLOGY_EJECTA_1'] = data['MORPHOLOGY_EJECTA_1'].astype('category')

print('Some crater morphology contains multiple categories that the morphology may fall under. For this study '
      'we want to limit the data to craters with well defined morphology, i.e. have only one category they fall under.')

#Creating this function to help identify craters with morphology that only fit into one category (i.e. does not have / in the name)
#if the crater morphology fits more than 1 type, we return a 1 else we return a 0
def identifysingletype(cratertype):
    """Flag a morphology label that names more than one type.

    Returns 1 when ``cratertype`` contains a '/' and 0 when it does not.
    """
    return 1 if '/' in cratertype else 0

print("We will label and filter out craters that fall in more then one category of morphology in our filtered table data2 "
      "as well as exclude any craters that don't fall into a specific category.")
# Drop craters without a morphology. The explicit copy makes data2 an
# independent frame, so adding a column below does not write into a slice of
# data (which raises SettingWithCopyWarning).
data2 = data.dropna(subset=['MORPHOLOGY_EJECTA_1']).copy()
# MORPH_CATEGORY_1 is 1 when the morphology label contains '/', otherwise 0.
data2['MORPH_CATEGORY_1'] = data2['MORPHOLOGY_EJECTA_1'].apply(identifysingletype)
# Keep only the craters whose label names a single morphology.
data3 = data2[data2['MORPH_CATEGORY_1'] == 0]

print('Finally we will exclude any craters which have fewer then 10 of a specific single morphology.')
#note there seems to be something odd that carries over from pandas to seaborn so that even though I filter out
#certain rows in the dataframe, they still appear as variables with 0 value in the plot, so this is a workaround
# observed=True counts only the categories that actually occur in data3; the
# zero-count categories would be removed by the filter below anyway.
# NOTE(review): the filter keeps counts strictly greater than 10, so a
# morphology with exactly 10 craters is dropped as well.
ejectamorphbin = data3.groupby('MORPHOLOGY_EJECTA_1', observed=True).size()
ejectamorphbin = ejectamorphbin[ejectamorphbin > 10]
summarytable = pandas.DataFrame({'COUNT': ejectamorphbin}).reset_index()
labels = numpy.array(summarytable['MORPHOLOGY_EJECTA_1'])
counts = numpy.array(summarytable['COUNT'])
# Hard-coded left-to-right order of the bars in the plot.
orderedarray = ['Rd', 'SLEPS', 'SLERS', 'SLEPC', 'SLERC', 'DLERS', 'DLEPS', 'MLERS',
       'DLEPC', 'DLERC', 'SLEPCPd', 'SLEPSPd', 'SLEPd', 'MLEPS', 'SLERSPd']

print('This plot shows the distribution of crater morphology.')
plotc3 = seaborn.barplot(x=labels, y=counts, order=orderedarray)
plt.xlabel('Crater Morphology Type')
plt.title('Mars Crater Morphology Distribution')
plt.xticks(rotation='vertical')

print('Let us now look at data with only the top 3 morphology types present')
morphofinterest = ['Rd', 'SLEPS', 'SLERS']
data3 = data3[data3['MORPHOLOGY_EJECTA_1'].isin(morphofinterest)]

# Build a new dataframe from plain numpy arrays of the three columns used
# below, so data4 does not inherit the index or dtypes of the filtered frame.
latitude = numpy.array(data3['LATITUDE_CIRCLE_IMAGE'])
diameter = numpy.array(data3['DIAM_CIRCLE_IMAGE'])
morphology = numpy.array(data3['MORPHOLOGY_EJECTA_1'])
data4 = pandas.DataFrame({'LATITUDE_CIRCLE_IMAGE': latitude, 'DIAM_CIRCLE_IMAGE': diameter, 'MORPHOLOGY_EJECTA_1': morphology})

# Seven bin edges from -90 to 90 in steps of 30 give six bins (the original
# message said 7, which counted the edges rather than the bins).
print('We will bin the latitudes into 6 discrete bins of 30 degrees.')
data4['LATITUDE_BIN'] = pandas.cut(data4['LATITUDE_CIRCLE_IMAGE'], [-90, -60, -30, 0, 30, 60, 90])

# seaborn.factorplot was renamed to catplot and later removed; kind='point'
# reproduces the point plot that factorplot drew by default.
seaborn.catplot(x='LATITUDE_BIN', y='DIAM_CIRCLE_IMAGE', hue='MORPHOLOGY_EJECTA_1', data=data4, kind='point')
plt.xlabel('Latitude by Bin (Degrees)')
plt.ylabel('Crater Diameter (km)')
plt.title('Distribution of Martian Crater Diameter by Latitude')
plt.xticks(rotation='vertical')

print('First we will investigate the mean and standard deviation for the 3 specific morphology types.')

#this is an array containing the stats of interest we want
statsofinterest = ['mean', 'std']

# One row per morphology, one column per statistic of the crater diameter.
# Aggregating directly avoids relying on the long-format describe() output
# ('level_1' column) and on positional column selection, neither of which
# works on current pandas.
summarystatistics3 = data4.groupby('MORPHOLOGY_EJECTA_1')['DIAM_CIRCLE_IMAGE'].agg(statsofinterest)
print(summarystatistics3)

print('This will perform an ANOVA on whether the crater diameter varies with EJECTA MORPHOLOGY')
model1 = smf.ols(formula='DIAM_CIRCLE_IMAGE ~ C(MORPHOLOGY_EJECTA_1)', data=data4)
results1 = model1.fit()
print(results1.summary())

print('This will print out the Tukey HSD comparison between the three morphology types')
mc1 = multi.MultiComparison(data4['DIAM_CIRCLE_IMAGE'], data4['MORPHOLOGY_EJECTA_1'])
tukeyres1 = mc1.tukeyhsd()
print(tukeyres1.summary())