# code for computing measures of global spatial autocorrelation of randomly
# generated points, for different grids and sampling rates; specifically:
# for i in I iterations (e.g. 1000):
#     create a random distribution of points
#     create a spatial weights matrix
#     for rho in [0, 0.2, 0.4, 0.8, 0.999] (0.999 standing in for 1, where
#     I - rho * W would be singular):
#         use the spatial autoregressive model $Y = (I - \rho W)^{-1} \epsilon$
#         to generate an autocorrelated variable for each point
#         loop over each sample size ["3%","5%","10%","20%","50%","100%"]
#         and then over each grid size ["6x6","8x8","10x10","12x12","15x15"],
#             aggregating the sampled points to each grid cell,
#             computing global spatial autocorrelation stats,
#             and appending the results to a list
#     save the list to a csv file
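# the script expects a grids/ folder holding the geojson grids loaded below,
# and an existing outputs/ folder to receive the per-iteration csv results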
import numpy as np
import pandas as pd
import geopandas as gpd
import seaborn as sns
import shapely
import pysal
import csv
import time
# constant data generation variables
total_points = 10000
knn_for_W = 30
iterations = 1  # e.g. 1000 for a full run
# load in grid data (was generated in QGIS by hand)
grid_6 = gpd.read_file("grids/grid_6x6.geojson")
grid_8 = gpd.read_file("grids/grid_8x8.geojson")
grid_10 = gpd.read_file("grids/grid_10x10.geojson")
grid_12 = gpd.read_file("grids/grid_12x12.geojson")
grid_15 = gpd.read_file("grids/grid_15x15.geojson")
# setup lists for looping samples and grids
grids = [grid_6, grid_8, grid_10, grid_12, grid_15]
#grids = [grid_6]
samples = ["sample_03","sample_05","sample_10","sample_20","sample_50","sample_100"]
#samples = ["sample_100"]
sample_names = ["3%","5%","10%","20%","50%","100%"]
#sample_names = ["100%"]
grid_names = ["6x6","8x8","10x10","12x12","15x15"]
#grid_names = ["6x6"]
# timing
start_time = time.time()
# start iterating over iterations
i = 0
while i < iterations:
    try:
        # storing outputs in a simple python list
        outputs = []
        print(i, time.time() - start_time)
        # generate random locations of points
        xy = 1.2 * np.random.rand(total_points, 2)
        xydf = pd.DataFrame(data=xy)
        xydf.columns = ['x', 'y']
        # standard normal errors for each point
        errors_ind = np.random.normal(0, 1, total_points)
        xydf["errors"] = errors_ind
        # an identity matrix needed for generating simulated values at each point
        I = np.identity(total_points)
        # weights matrix for k nearest neighbours
        kd = pysal.lib.cg.kdtree.KDTree(xy)
        W = pysal.lib.weights.KNN(kd, knn_for_W)
        W.transform = 'r'  # row normalizing
        # extract the sparse weights matrix as a dense np array for matrix multiplication
        W = W.sparse.toarray()
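        # note: with total_points = 10000 this dense W is a 10000 x 10000 float64
        # array (~800 MB), so memory is the binding constraint at larger sizes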
        for rho in [0, 0.2, 0.4, 0.8, 0.999]:
            # spatial autoregression: $Y = (I - \rho W)^{-1} \epsilon$
            # solve the linear system rather than inverting explicitly
            Y = np.linalg.solve(I - rho * W, errors_ind)
            # append these Y values to the point data frame
            xydf["sim"] = Y
            # use binomial draws to decide whether each point is sampled
            xydf["sample_03"] = np.random.binomial(1, 0.03, size=total_points)
            xydf["sample_05"] = np.random.binomial(1, 0.05, size=total_points)
            xydf["sample_10"] = np.random.binomial(1, 0.10, size=total_points)
            xydf["sample_20"] = np.random.binomial(1, 0.20, size=total_points)
            xydf["sample_50"] = np.random.binomial(1, 0.50, size=total_points)
            xydf["sample_100"] = np.random.binomial(1, 1, size=total_points)
            s = 0
            for sample in samples:
                # subset data by each sample
                xydf_s = xydf.loc[xydf[sample] == 1]
                # set up a geodataframe, to allow for the spatial join below
                geometry = [shapely.geometry.Point(pt) for pt in zip(xydf_s.x, xydf_s.y)]
                gdf = gpd.GeoDataFrame(xydf_s, geometry=geometry)
                g = 0
                for grid in grids:
                    print(i, rho, sample_names[s], grid_names[g], time.time() - start_time)
                    # spatial join the grid IDs to the point data
                    xy_with_grid = gpd.sjoin(gdf, grid, how="inner", op='intersects')
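                    # (newer geopandas releases use predicate= in place of op=)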
                    # generate counts and means in each cell of the grid
                    grid_desc = xy_with_grid.groupby(['id']).agg({'errors': "count", 'sim': "mean"})
                    # rename the aggregated column
                    grid_desc["mean"] = grid_desc["sim"]
                    del grid_desc['sim'], grid_desc['errors']
                    # join back to grid boundaries
                    grid_join = grid.merge(grid_desc, on='id')
                    # compute spatial weights matrix (queen contiguity) for the grid cells
                    Wg = pysal.lib.weights.Queen.from_dataframe(grid_join)
                    Wg.transform = 'r'  # row normalizing
                    mi = pysal.explore.esda.Moran(np.array(grid_join["mean"]), Wg, two_tailed=False)
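                    # mi.I is global Moran's I; mi.p_norm is its one-tailed
                    # normal-approximation p-value (since two_tailed=False)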
                    # generate a random covariate so the lag model has an
                    # explanatory variable; the lag coefficient rho is the
                    # statistic of interest here
                    grid_join["var"] = np.random.normal(0, 1, len(grid_join))
                    Ym = grid_join['mean'].values.reshape((len(grid_join), 1))
                    Xm = grid_join[['var']].values
                    mlag = pysal.model.spreg.ml_lag.ML_Lag(Ym, Xm, w=Wg, name_y='mean', name_x=['var'])
                    # output the values: Moran's I and its p-value, plus the
                    # ML-lag rho and the p-value of its z-statistic
                    output = [i, rho, sample_names[s], grid_names[g],
                              round(mi.I, 3), round(mi.p_norm, 3),
                              round(mlag.rho, 3), round(mlag.z_stat[2][1], 3)]
                    outputs.append(output)
                    g += 1
                s += 1
with open("outputs/" + str(i) + ".csv", "w") as csvfile:
writer = csv.writer(csvfile)
for row in outputs:
writer.writerow(row)
i += 1
except:
None
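
# a minimal post-processing sketch (an assumption, not part of the original
# script): combine the per-iteration csv files in outputs/ into one labelled
# table; the column names follow the `output` list built above
import glob
frames = [pd.read_csv(f, header=None) for f in glob.glob("outputs/[0-9]*.csv")]
if frames:
    results = pd.concat(frames, ignore_index=True)
    results.columns = ["iteration", "rho", "sample", "grid",
                       "moran_i", "moran_p", "lag_rho", "lag_rho_p"]
    results.to_csv("outputs/combined.csv", index=False)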