Skip to content

Commit adbc8de

Browse files
committed
change ec2
1 parent ad06048 commit adbc8de

File tree

4 files changed

+5615
-31
lines changed

4 files changed

+5615
-31
lines changed

sandbox/src/approximate_bayesian.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,8 @@ def main():
5151
_, bins, patch = ax[0].hist(y_sims, bins=20)
5252
ax[0].set_title("Prior - Activity")
5353
ax[0].set_xlim([-1, 1])
54-
54+
ax[0].set_xlabel("Activity")
55+
ax[0].set_ylabel("Number of samples")
5556
y_obs = 0.25
5657
print(f"Number of samples: {len(data)}")
5758
epsilon = 0.25
@@ -61,15 +62,17 @@ def main():
6162
print(f"Number of accepted samples: {len(posterior_samples)}")
6263
# Plot the accepted samples
6364
ax[1].hist(posterior_samples, bins=bins)
64-
ax[1].set_title("Posterior - Activity")
65+
ax[1].set_title("Posterior - Activity (ABC)")
6566
ax[1].axvline(x=y_obs, color="red", linestyle="--", label="Observed")
6667
# Plot eplison
6768
ax[1].axvline(x=y_obs + epsilon, color="black", linestyle="--", label="Epsilon")
6869
ax[1].axvline(x=y_obs - epsilon, color="black", linestyle="--")
6970
ax[1].legend()
7071
ax[1].set_xlim([-1, 1])
72+
ax[1].set_xlabel("Activity")
73+
7174
plt.tight_layout()
72-
plt.savefig("posterior_samples.png")
75+
plt.savefig("posterior_abc.png")
7376

7477
if __name__ == "__main__":
7578
main()

sandbox/src/gp.py

Lines changed: 46 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,10 @@
55
import matplotlib.pyplot as plt
66
from sklearn.gaussian_process import GaussianProcessRegressor
77
from sklearn.gaussian_process.kernels import RBF, Matern, WhiteKernel, ConstantKernel as C
8-
from sklearn.model_selection import train_test_split, KFold
98
from sklearn.metrics import mean_squared_error, r2_score
9+
from sklearn.model_selection import train_test_split
1010
from sklearn.preprocessing import StandardScaler
11+
from sklearn.decomposition import PCA
1112

1213
def plot_parity_with_uncertainty(
1314
y_true_train, y_pred_train, y_std_train,
@@ -88,24 +89,6 @@ def plot_parity_with_uncertainty(
8889
plt.savefig(filename)
8990

9091

91-
def clean_data(full_data, response):
92-
"""Handle missing or non-numeric data"""
93-
94-
# Remove rows with multiple components
95-
full_data = full_data[full_data["COMPONENTS"] == 1]
96-
full_data.reset_index(drop=True, inplace=True)
97-
98-
# Remove response rows with bad values
99-
full_data = full_data.loc[~full_data[response].isin([np.nan, np.inf, -np.inf])]
100-
full_data.reset_index(drop=True, inplace=True)
101-
102-
# Removed features columns with bad values
103-
numeric_cols = full_data.select_dtypes(include=[np.number]).columns
104-
full_data = full_data.loc[
105-
:, ~(np.isnan(full_data[numeric_cols]).any(axis=0) | np.isinf(full_data[numeric_cols])).any(axis=0)
106-
]
107-
return full_data
108-
10992
OUTPUT_MAPPING = {"ACTIVITY": 0, "GROWTH": 1, "SYMMETRY": 2}
11093

11194
# Load data
@@ -120,6 +103,7 @@ def clean_data(full_data, response):
120103
"AVG_DEGREE", "AVG_CLUSTERING", "AVG_CLOSENESS",
121104
"AVG_BETWEENNESS", "AVG_CORENESS"
122105
]
106+
features = ["RADIUS"]
123107
spatial_features = [
124108
"RADIUS", "LENGTH", "WALL", "SHEAR", "CIRCUM", "FLOW",
125109
"NODES", "EDGES", "GRADIUS", "GDIAMETER", "AVG_ECCENTRICITY",
@@ -182,7 +166,7 @@ def clean_data(full_data, response):
182166
y_train = scaler.fit_transform(y_train)
183167
y_test = scaler.transform(y_test)
184168
if train:
185-
gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, alpha=3e-1)
169+
gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, alpha=3e-6)
186170
gp.fit(X_train, y_train)
187171
else:
188172
gp = joblib.load('gp.pkl')
@@ -191,7 +175,48 @@ def clean_data(full_data, response):
191175

192176
y_pred, y_pred_std = gp.predict(X_test, return_std=True)
193177
y_pred_train, y_pred_std_train = gp.predict(X_train, return_std=True)
194-
# Convert back to original scale
178+
# Plot GP prediction function with uncertainty
179+
fig, ax = plt.subplots(1, 1, figsize=(6, 6))
180+
plot_pca = False
181+
if plot_pca:
182+
pca = PCA(n_components=1) # Focus on PC1 for plotting
183+
pca_X_train = pca.fit_transform(X_train)
184+
pca_X_test = pca.transform(X_test)
185+
# Generate uniform points in the PC1 space
186+
pc1_min, pc1_max = pca_X_train.min(), pca_X_train.max()
187+
pc1_uniform = np.linspace(pc1_min, pc1_max, 50).reshape(-1, 1)
188+
# Map uniform points back to the original feature space
189+
X_uniform = pca.inverse_transform(pc1_uniform)
190+
# Get GP predictions (mean and standard deviation) for the uniform points
191+
y_p, y_std = gp.predict(X_uniform, return_std=True)
192+
193+
# Plot GP prediction with uncertainty
194+
# Scatter plot for training data in PC1
195+
print(pca_X_train.shape, y_train.shape)
196+
ax.scatter(pca_X_train[:, 0], y_train[:, 0], label="Train Data", color="blue", alpha=0.6)
197+
ax.scatter(pca_X_test[:, 0], y_test[:, 0], label="Test Data", color="green", alpha=0.6)
198+
199+
# GP prediction mean
200+
ax.scatter(pc1_uniform, y_p[:, 0], label="GP Prediction", color="red", linewidth=2)
201+
ax.plot(pc1_uniform, y_p[:, 0], label="GP Prediction", color="red", linewidth=2)
202+
# Customize the plot
203+
ax.set_title("GP Prediction with Uncertainty")
204+
ax.set_xlabel("Principal Component 1 (PC1)")
205+
ax.set_ylabel("Prediction")
206+
else:
207+
x = np.linspace(-3, 3, 1000).reshape(-1, 1)
208+
y_p = gp.predict(x)
209+
ax.scatter(X_train, y_train[:, 0], label="Train")
210+
ax.scatter(X_test, y_test[:, 0], label="Test")
211+
ax.plot(x, y_p[:, 0], label="Prediction", color="red")
212+
ax.set_title("GP Prediction")
213+
ax.set_xlabel("RADIUS")
214+
ax.set_ylabel("Prediction")
215+
216+
ax.legend()
217+
218+
plt.tight_layout()
219+
plt.savefig("gp.png")
195220
"""
196221
y_pred = scaler.inverse_transform(y_pred)
197222
y_pred_std = scaler.inverse_transform(y_pred_std)

sandbox/src/mcmc.py

Lines changed: 76 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,18 @@
22
import pandas as pd
33
import random
44
import matplotlib.pyplot as plt
5+
from sklearn.preprocessing import StandardScaler
6+
from sklearn.decomposition import PCA
7+
from sklearn.preprocessing import LabelEncoder
8+
9+
column_names = [
10+
"KEY",
11+
"RADIUS", "LENGTH", "WALL", "SHEAR", "CIRCUM", "FLOW",
12+
"NODES", "EDGES", "GRADIUS", "GDIAMETER", "AVG_ECCENTRICITY",
13+
"AVG_SHORTEST_PATH", "AVG_IN_DEGREES", "AVG_OUT_DEGREES",
14+
"AVG_DEGREE", "AVG_CLUSTERING", "AVG_CLOSENESS",
15+
"AVG_BETWEENNESS", "AVG_CORENESS"
16+
]
517

618
# Define distance function based on the paper
719
def distance_function(y_obs, y_sim, weight=1.0):
@@ -61,22 +73,27 @@ def mcmc(data, y_sims, y_obs, n_iterations, proposal_std=1.0):
6173
samples.append(np.append(current_theta, proposal_y_sim))
6274
# Remove duplicates in the samples
6375
#samples = list(set(tuple(row) for row in samples))
64-
return pd.DataFrame(samples, columns=["NODES", "EDGES", "GRADIUS", "ACTIVITY"])
76+
return pd.DataFrame(samples, columns= column_names + ["ACTIVITY"])
6577

6678
def main():
6779
# Load ABM data
68-
data_path = "../../data/ARCADE/C-feature_0.0_metric_15-04032023.csv"
80+
data_path = "../../data/ARCADE/C-feature_15.0_metric_15-04032023.csv"
6981
data = pd.read_csv(data_path)
82+
data = data[data["COMPONENTS"] == 1]
83+
threshold = 0.2
84+
columns_to_drop = [col for col in data.columns if ((data[col] == np.inf) | (data[col] == -np.inf)).mean() >= threshold]
85+
data = data.drop(columns=columns_to_drop)
7086

7187
# Extract inputs (theta) and outputs (y)
72-
input_feature_names = ["NODES", "EDGES", "GRADIUS"]
88+
input_feature_names = column_names #["NODES", "EDGES", "GRADIUS"]
7389
# input_feature_names = ["ACTIVITY"]
7490
predicted_output = ["ACTIVITY"]#, "GROWTH", "SYMMETRY"]
7591
input_features = data[input_feature_names].values
92+
7693
y_sims = data[predicted_output].values
7794

7895
# Observed value
79-
y_obs = [1]#, -10, 0]
96+
y_obs = [0.25]#, -10, 0]
8097

8198
# Run MCMC
8299
n_iterations = 10000
@@ -89,12 +106,64 @@ def main():
89106
print(f"Number of samples: {len(posterior_samples)}")
90107
print(posterior_samples.describe())
91108
# Plot the accepted samples activity
92-
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
109+
fig, ax = plt.subplots(1, 3, figsize=(15, 5))
93110
_, bins, patch = ax[0].hist(y_sims, bins=20)
94111
ax[0].set_title("Prior - Activity")
95-
112+
ax[0].set_xlim([-1, 1])
113+
ax[0].set_xlabel("Activity")
114+
ax[0].set_ylabel("Number of samples")
96115
ax[1].hist(posterior_samples["ACTIVITY"], bins=bins)
97-
ax[1].set_title("Posterior - Activity")
116+
ax[1].set_title("Posterior - Activity (MCMC)")
117+
ax[1].set_xlim([-1, 1])
118+
ax[1].set_xlabel("Activity")
119+
ax[1].axvline(y_obs[0], color="red", linestyle="--", label="Target activity")
120+
ax[1].legend()
121+
122+
pca = PCA(n_components=2)
123+
scaler = StandardScaler()
124+
features = scaler.fit_transform(input_features[:, 1:])
125+
label_encoder = LabelEncoder()
126+
labels = label_encoder.fit_transform(input_features[:, 0])
127+
reduced_features = pca.fit_transform(features)
128+
categories = label_encoder.classes_
129+
markers = ['o', 's', 'D', '^', 'v', '<', '>', 'p', '*', 'h', 'H', '+', 'x', 'd', '|', '_']
130+
unique_labels = np.unique(labels)
131+
cmap = plt.cm.viridis
132+
# drop duplicates
133+
posterior_samples = posterior_samples.drop_duplicates(subset=input_feature_names)
134+
posterior_reduced_features = pca.transform(scaler.transform(posterior_samples[input_feature_names].values[:, 1:]))
135+
posterior_labels = label_encoder.transform(posterior_samples[input_feature_names].values[:, 0])
136+
137+
for i, label in enumerate(unique_labels):
138+
ax[2].scatter(reduced_features[labels == label, 0],
139+
reduced_features[labels == label, 1],
140+
marker=markers[i % len(markers)],
141+
label=f"{categories[label]}",
142+
facecolors='none',
143+
edgecolors=cmap(i / len(unique_labels))
144+
)
145+
ax[2].scatter(posterior_reduced_features[posterior_labels == label, 0],
146+
posterior_reduced_features[posterior_labels == label, 1],
147+
marker=markers[i % len(markers)],
148+
facecolors=cmap(i / len(unique_labels)),
149+
edgecolors='none', alpha=0.8
150+
)
151+
152+
# Create custom legends
153+
handles1 = [plt.Line2D([0], [0], marker=markers[i % len(markers)], color='w', label=categories[label],
154+
markerfacecolor='none', markeredgecolor=cmap(i / len(unique_labels)))
155+
for i, label in enumerate(unique_labels)]
156+
handles2 = [plt.Line2D([0], [0], marker='o', color='w', label='Prior', markerfacecolor='none', markeredgecolor='k'),
157+
plt.Line2D([0], [0], marker='o', color='w', label='Posterior', markerfacecolor='k', markeredgecolor='none', alpha=0.5)]
158+
159+
legend1 = ax[2].legend(handles=handles1, title="Vasculature type", loc='upper right')
160+
ax[2].add_artist(legend1)
161+
ax[2].legend(handles=handles2, title="Distribution", loc='lower right')
162+
ax[2].set_title("PCA - Vasculature distribution")
163+
ax[2].set_xlabel("PC1")
164+
ax[2].set_ylabel("PC2")
165+
plt.tight_layout()
166+
98167
plt.savefig("posterior_mcmc.png")
99168

100169
if __name__ == "__main__":

0 commit comments

Comments
 (0)