# watersecurity/waterUESI.py at main · vaishnavdilip/watersecurity · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140

import pandas as pd
import numpy as np

# import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.neighbors import kneighbors_graph
from sklearn.cluster import KMeans

import plotly.express as px
import plotly.figure_factory as ff

# # Urban Environment & Social Inclusion Index Data

# The index aims to measure how cities perform at the intersection of environment and social equity. On the topic of water security, it includes the water resource management issue category, which has two indicators. Water stress measures the ratio of surface water withdrawn to the total annual natural availability of surface water in key sub-basins of interest. Wastewater treatment measures the proportion of wastewater that is treated before it is released back into the environment, relative to the amount of wastewater generated by an urban area.

# Load the UESI water dataset and index its rows by city name
df_water = pd.read_csv('https://www.dropbox.com/s/51wx95m7lgxl7fq/water.csv?dl=1')
df_water = df_water.set_index('city')

# Variables with a high proportion of missing data
_sparse_columns = [
    'INCOME_STD', 'CLIMPOL.UESI', 'WATTREAT.UESI',
    'ALBEDO', 'ELEVATION', 'DENSITY',
]
# Index variables highly correlated with other variables
_correlated_columns = [
    'PM25', 'CO2.UESI', 'NO2.UESI', 'PM25.UESI', 'PM25EX.UESI',
    'PUBTRANS.UESI', 'TRANSCOV.UESI', 'TREECAP.UESI', 'TREELOSS.UESI',
    'UHI.UESI', 'WATSTRESS.UESI',
]

# Cleaning order matters: the sparse variables are removed first so that the
# row-wise dropna only discards cities with gaps in the remaining variables.
df_water = df_water.drop(columns=_sparse_columns)
df_water = df_water.dropna(how='any')
df_water = df_water.drop(columns=_correlated_columns)

# Raw value matrix and the matching city labels
dr = df_water.values
t = df_water.index

# Fit a PCA model with two components to the dataset
pca_12 = PCA(n_components=2).fit(dr)

# Dataset expressed in the two principal components, one row per city
Z = pd.DataFrame(pca_12.transform(dr), index=t, columns=['PC1', 'PC2'])

# ## K-means clustering

# Fit a k-means model (8 clusters) to the PCA-transformed dataset.
# random_state is fixed so the cluster labels are reproducible from one run
# to the next, in line with the seeded KMeans used for the spectral
# clustering further down in this file.
kmeans = KMeans(n_clusters=8, random_state=0)

# Cluster label of every city (row of Z), in row order
y_label = kmeans.fit_predict(Z)

# ## Spectral clustering

# Adjacency matrix of the 5-nearest-neighbour graph built on the rows of Z
A = kneighbors_graph(Z, n_neighbors=5).toarray()

# Simple Laplacian L = D - A, where D is the diagonal matrix holding the
# row sums of A
_row_sums = A.sum(axis=1)
D = np.diag(_row_sums)
L = D - A

# Eigenvalues and eigenvectors of L; only their real parts are kept.
# NOTE(review): np.linalg.eig can return complex eigenpairs when L is not
# symmetric (a nearest-neighbour adjacency need not be); the imaginary parts
# are discarded here - confirm this is intended.
_raw_vals, _raw_vecs = np.linalg.eig(L)
vals = np.real(_raw_vals)
vecs = np.real(_raw_vecs)

# Order the eigenpairs by ascending eigenvalue
_ascending = np.argsort(vals)
vals_sorted = vals[_ascending]
vecs_sorted = vecs[:, _ascending]

# Fiedler vector: the eigenvector that corresponds to the Fiedler value
# (the second-smallest eigenvalue).
# U stores, for every row of Z, its loading in the Fiedler vector as a
# single-column array.
U = vecs_sorted[:, 1].copy().reshape(-1, 1)

# Split the Fiedler-vector loadings into nine clusters with KMeans, using
# k-means++ initialisation and a fixed random seed
km = KMeans(n_clusters=9, init='k-means++', random_state=0).fit(U)

# Cluster ids and the number of cities assigned to each of them
unique, counts = np.unique(km.labels_, return_counts=True)

# With the fitted model, every point can now be assigned to its cluster.

# Shift each principal component so that its smallest value becomes 1
# (strictly positive values, for visualization purposes)
for _component in ('PC1', 'PC2'):
    Z[_component] = Z[_component] - Z[_component].min() + 1

# Add the cluster labels to the transformed dataset
Z['cluster'] = km.labels_

# To visualize the closest neighbors of a given city according to the selected variables of the index, we create the following function

def cluster_neighbors(params):
    """Scatter-plot every city that shares a cluster with the given city.

    Args:
        params: mapping with a 'city' key holding a label of ``Z``'s index.

    Returns:
        The plotly figure showing PC1 against PC2 for the cities in the same
        cluster, each point labelled with its city name.
    """
    # Cluster id of the requested city, then all rows carrying that id
    target_cluster = Z.loc[params['city']]['cluster']
    same_cluster = Z.cluster == target_cluster
    # reset_index turns the city index into a regular 'city' column
    members = Z[same_cluster].reset_index()

    # Scatter plot of the cluster members in principal-component space
    fig = px.scatter(
        members,
        x='PC1',
        y='PC2',
        text="city",
        hover_data=['city'],
        color_discrete_sequence=['#3782fa'],
    )
    fig.update_traces(textposition='top center')
    fig.update_layout(height=800)
    return fig

# Attach the cluster labels to the clean dataset as well; its rows are in the
# same order as those of Z, which was built from df_water's values and index.
df_water = df_water.assign(cluster=km.labels_)

# The following function, shows the distribution of water stress in the cluster of a given city

def cluster_waterStress(params):
    """Plot the water-stress distribution of the cluster a city belongs to.

    The figure shows the distribution of the 'WATSTRESS' values of all cities
    in the same cluster as the requested city, with dashed vertical markers
    at 0.2 ("Medium Stress"), at 0.4 ("High Stress") and at the requested
    city's own value.

    Args:
        params: mapping with a 'city' key holding a label of ``df_water``'s
            index.

    Returns:
        The plotly distplot figure.

    Raises:
        KeyError: if ``params['city']`` is not a label of ``df_water``'s index.
    """
    city = params['city']

    # Look the scalars up directly by (row label, column) instead of chained
    # indexing; the previous float() over a one-element array is deprecated
    # in recent NumPy releases.
    city_cluster = df_water.loc[city, 'cluster']
    city_stress = float(df_water.loc[city, 'WATSTRESS'])

    # Water-stress values of every city in the same cluster
    in_cluster = df_water['cluster'] == city_cluster
    cluster_stress = df_water.loc[in_cluster, 'WATSTRESS'].to_numpy()

    # A density curve cannot be estimated from fewer than two distinct
    # values, so fall back to histogram + rug only for such tiny clusters.
    show_curve = np.unique(cluster_stress).size > 1

    fig = ff.create_distplot(
        [cluster_stress],
        group_labels=[city],
        bin_size=0.125,
        show_curve=show_curve,
    )

    # Reference thresholds
    fig.add_vline(x=0.2, line_width=3, line_dash="dash", line_color="yellow",
                  annotation_text="Medium Stress",
                  annotation_position="top right",
                  annotation_font_size=10)
    fig.add_vline(x=0.4, line_width=3, line_dash="dash", line_color="red",
                  annotation_text="High Stress",
                  annotation_font_size=10)
    # Position of the requested city within its cluster
    fig.add_vline(x=city_stress, line_width=3,
                  line_dash="dash", line_color="black",
                  annotation_text=city,
                  annotation_font_size=10)

    fig.update_layout(height=800, title='Density distribution water stress cluster ' + city)
    fig.update_xaxes(title_text='Water Stress')
    fig.update_yaxes(title_text='Density')

    return fig

# Cities available in the app: as a plain list and as a one-column
# DataFrame (column 'c')
cities = df_water.index.tolist()
cities_df = pd.DataFrame({'c': cities})