change ec2

pohaoc2 · pohaoc2 · commit adbc8de26ab6 · 2025-02-09T15:31:28.000-08:00
diff --git a/sandbox/src/approximate_bayesian.py b/sandbox/src/approximate_bayesian.py
@@ -51,7 +51,8 @@ def main():
     _, bins, patch = ax[0].hist(y_sims, bins=20)
     ax[0].set_title("Prior - Activity")
     ax[0].set_xlim([-1, 1])
-
+    ax[0].set_xlabel("Activity")
+    ax[0].set_ylabel("Number of samples")
     y_obs = 0.25
     print(f"Number of samples: {len(data)}")
     epsilon = 0.25
@@ -61,15 +62,17 @@ def main():
     print(f"Number of accepted samples: {len(posterior_samples)}")
     # Plot the accepted samples
     ax[1].hist(posterior_samples, bins=bins)
-    ax[1].set_title("Posterior - Activity")
+    ax[1].set_title("Posterior - Activity (ABC)")
     ax[1].axvline(x=y_obs, color="red", linestyle="--", label="Observed")
     # Plot epsilon
     ax[1].axvline(x=y_obs + epsilon, color="black", linestyle="--", label="Epsilon")
     ax[1].axvline(x=y_obs - epsilon, color="black", linestyle="--")
     ax[1].legend()
     ax[1].set_xlim([-1, 1])
+    ax[1].set_xlabel("Activity")
+
     plt.tight_layout()
-    plt.savefig("posterior_samples.png")
+    plt.savefig("posterior_abc.png")
 
 if __name__ == "__main__":
     main()
diff --git a/sandbox/src/gp.py b/sandbox/src/gp.py
@@ -5,9 +5,10 @@
 import matplotlib.pyplot as plt
 from sklearn.gaussian_process import GaussianProcessRegressor
 from sklearn.gaussian_process.kernels import RBF, Matern, WhiteKernel, ConstantKernel as C
-from sklearn.model_selection import train_test_split, KFold
 from sklearn.metrics import mean_squared_error, r2_score
+from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import StandardScaler
+from sklearn.decomposition import PCA
 
 def plot_parity_with_uncertainty(
     y_true_train, y_pred_train, y_std_train, 
@@ -88,24 +89,6 @@ def plot_parity_with_uncertainty(
     plt.savefig(filename)
 
 
-def clean_data(full_data, response):
-    """Handle missing or non-numeric data"""
-
-    # Remove rows with multiple components
-    full_data = full_data[full_data["COMPONENTS"] == 1]
-    full_data.reset_index(drop=True, inplace=True)
-
-    # Remove response rows with bad values
-    full_data = full_data.loc[~full_data[response].isin([np.nan, np.inf, -np.inf])]
-    full_data.reset_index(drop=True, inplace=True)
-
-    # Removed features columns with bad values
-    numeric_cols = full_data.select_dtypes(include=[np.number]).columns
-    full_data = full_data.loc[
-        :, ~(np.isnan(full_data[numeric_cols]).any(axis=0) | np.isinf(full_data[numeric_cols])).any(axis=0)
-    ]
-    return full_data
-
 OUTPUT_MAPPING = {"ACTIVITY": 0, "GROWTH": 1, "SYMMETRY": 2}
 
 # Load data
@@ -120,6 +103,7 @@ def clean_data(full_data, response):
     "AVG_DEGREE", "AVG_CLUSTERING", "AVG_CLOSENESS", 
     "AVG_BETWEENNESS", "AVG_CORENESS"
 ]
+features = ["RADIUS"]
 spatial_features = [
     "RADIUS", "LENGTH", "WALL", "SHEAR", "CIRCUM", "FLOW", 
     "NODES", "EDGES", "GRADIUS", "GDIAMETER", "AVG_ECCENTRICITY", 
@@ -182,7 +166,7 @@ def clean_data(full_data, response):
     y_train = scaler.fit_transform(y_train)
     y_test = scaler.transform(y_test)
     if train:
-        gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, alpha=3e-1)
+        gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, alpha=3e-6)
         gp.fit(X_train, y_train)
     else:
         gp = joblib.load('gp.pkl')
@@ -191,7 +175,48 @@ def clean_data(full_data, response):
 
     y_pred, y_pred_std = gp.predict(X_test, return_std=True)
     y_pred_train, y_pred_std_train = gp.predict(X_train, return_std=True)
-    # Convert back to original scale
+    # Plot GP prediction function with uncertainty
+    fig, ax = plt.subplots(1, 1, figsize=(6, 6))
+    plot_pca = False
+    if plot_pca:
+        pca = PCA(n_components=1)  # Focus on PC1 for plotting
+        pca_X_train = pca.fit_transform(X_train)
+        pca_X_test = pca.transform(X_test)
+        # Generate uniform points in the PC1 space
+        pc1_min, pc1_max = pca_X_train.min(), pca_X_train.max()
+        pc1_uniform = np.linspace(pc1_min, pc1_max, 50).reshape(-1, 1)
+        # Map uniform points back to the original feature space
+        X_uniform = pca.inverse_transform(pc1_uniform)
+        # Get GP predictions (mean and standard deviation) for the uniform points
+        y_p, y_std = gp.predict(X_uniform, return_std=True)
+
+        # Plot GP prediction with uncertainty
+        # Scatter plot for training data in PC1
+        print(pca_X_train.shape, y_train.shape)
+        ax.scatter(pca_X_train[:, 0], y_train[:, 0], label="Train Data", color="blue", alpha=0.6)
+        ax.scatter(pca_X_test[:, 0], y_test[:, 0], label="Test Data", color="green", alpha=0.6)
+
+        # GP prediction mean
+        ax.scatter(pc1_uniform, y_p[:, 0], label="GP Prediction", color="red", linewidth=2)
+        ax.plot(pc1_uniform, y_p[:, 0], label="GP Prediction", color="red", linewidth=2)
+        # Customize the plot
+        ax.set_title("GP Prediction with Uncertainty")
+        ax.set_xlabel("Principal Component 1 (PC1)")
+        ax.set_ylabel("Prediction")
+    else:
+        x = np.linspace(-3, 3, 1000).reshape(-1, 1)
+        y_p = gp.predict(x)
+        ax.scatter(X_train, y_train[:, 0], label="Train")
+        ax.scatter(X_test, y_test[:, 0], label="Test")
+        ax.plot(x, y_p[:, 0], label="Prediction", color="red")
+        ax.set_title("GP Prediction")
+        ax.set_xlabel("RADIUS")
+        ax.set_ylabel("Prediction")
+
+    ax.legend()
+
+    plt.tight_layout()
+    plt.savefig("gp.png")
     """
     y_pred = scaler.inverse_transform(y_pred)
     y_pred_std = scaler.inverse_transform(y_pred_std)
diff --git a/sandbox/src/mcmc.py b/sandbox/src/mcmc.py
@@ -2,6 +2,18 @@
 import pandas as pd
 import random
 import matplotlib.pyplot as plt
+from sklearn.preprocessing import StandardScaler
+from sklearn.decomposition import PCA
+from sklearn.preprocessing import LabelEncoder
+
+column_names = [
+    "KEY",
+    "RADIUS", "LENGTH", "WALL", "SHEAR", "CIRCUM", "FLOW", 
+    "NODES", "EDGES", "GRADIUS", "GDIAMETER", "AVG_ECCENTRICITY", 
+    "AVG_SHORTEST_PATH", "AVG_IN_DEGREES", "AVG_OUT_DEGREES", 
+    "AVG_DEGREE", "AVG_CLUSTERING", "AVG_CLOSENESS", 
+    "AVG_BETWEENNESS", "AVG_CORENESS"
+]
 
 # Define distance function based on the paper
 def distance_function(y_obs, y_sim, weight=1.0):
@@ -61,22 +73,27 @@ def mcmc(data, y_sims, y_obs, n_iterations, proposal_std=1.0):
             samples.append(np.append(current_theta, proposal_y_sim))
     # Remove duplicates in the samples
     #samples = list(set(tuple(row) for row in samples))
-    return pd.DataFrame(samples, columns=["NODES", "EDGES", "GRADIUS", "ACTIVITY"])
+    return pd.DataFrame(samples, columns= column_names + ["ACTIVITY"])
 
 def main():
     # Load ABM data
-    data_path = "../../data/ARCADE/C-feature_0.0_metric_15-04032023.csv"
+    data_path = "../../data/ARCADE/C-feature_15.0_metric_15-04032023.csv"
     data = pd.read_csv(data_path)
+    data = data[data["COMPONENTS"] == 1]
+    threshold = 0.2
+    columns_to_drop = [col for col in data.columns if ((data[col] == np.inf) | (data[col] == -np.inf)).mean() >= threshold]
+    data = data.drop(columns=columns_to_drop)
 
     # Extract inputs (theta) and outputs (y)
-    input_feature_names = ["NODES", "EDGES", "GRADIUS"]
+    input_feature_names = column_names #["NODES", "EDGES", "GRADIUS"]
     # input_feature_names = ["ACTIVITY"]
     predicted_output = ["ACTIVITY"]#, "GROWTH", "SYMMETRY"]
     input_features = data[input_feature_names].values
+    
     y_sims = data[predicted_output].values
 
     # Observed value
-    y_obs = [1]#, -10, 0]
+    y_obs = [0.25]#, -10, 0]
 
     # Run MCMC
     n_iterations = 10000
@@ -89,12 +106,64 @@ def main():
     print(f"Number of samples: {len(posterior_samples)}")
     print(posterior_samples.describe())
     # Plot the accepted samples activity
-    fig, ax = plt.subplots(1, 2, figsize=(10, 5))
+    fig, ax = plt.subplots(1, 3, figsize=(15, 5))
     _, bins, patch = ax[0].hist(y_sims, bins=20)
     ax[0].set_title("Prior - Activity")
-
+    ax[0].set_xlim([-1, 1])
+    ax[0].set_xlabel("Activity")
+    ax[0].set_ylabel("Number of samples")
     ax[1].hist(posterior_samples["ACTIVITY"], bins=bins)
-    ax[1].set_title("Posterior - Activity")
+    ax[1].set_title("Posterior - Activity (MCMC)")
+    ax[1].set_xlim([-1, 1])
+    ax[1].set_xlabel("Activity")
+    ax[1].axvline(y_obs[0], color="red", linestyle="--", label="Target activity")
+    ax[1].legend()
+
+    pca = PCA(n_components=2)
+    scaler = StandardScaler()
+    features = scaler.fit_transform(input_features[:, 1:])
+    label_encoder = LabelEncoder()
+    labels = label_encoder.fit_transform(input_features[:, 0])
+    reduced_features = pca.fit_transform(features)
+    categories = label_encoder.classes_
+    markers = ['o', 's', 'D', '^', 'v', '<', '>', 'p', '*', 'h', 'H', '+', 'x', 'd', '|', '_']
+    unique_labels = np.unique(labels)
+    cmap = plt.cm.viridis
+    # drop duplicates
+    posterior_samples = posterior_samples.drop_duplicates(subset=input_feature_names)
+    posterior_reduced_features = pca.transform(scaler.transform(posterior_samples[input_feature_names].values[:, 1:]))
+    posterior_labels = label_encoder.transform(posterior_samples[input_feature_names].values[:, 0])
+
+    for i, label in enumerate(unique_labels):
+        ax[2].scatter(reduced_features[labels == label, 0],
+                      reduced_features[labels == label, 1], 
+                      marker=markers[i % len(markers)],
+                      label=f"{categories[label]}", 
+                      facecolors='none',
+                      edgecolors=cmap(i / len(unique_labels))
+                      )
+        ax[2].scatter(posterior_reduced_features[posterior_labels == label, 0], 
+                      posterior_reduced_features[posterior_labels == label, 1],
+                      marker=markers[i % len(markers)],
+                      facecolors=cmap(i / len(unique_labels)),
+                      edgecolors='none', alpha=0.8
+                      )
+
+    # Create custom legends
+    handles1 = [plt.Line2D([0], [0], marker=markers[i % len(markers)], color='w', label=categories[label],
+                           markerfacecolor='none', markeredgecolor=cmap(i / len(unique_labels))) 
+                for i, label in enumerate(unique_labels)]
+    handles2 = [plt.Line2D([0], [0], marker='o', color='w', label='Prior', markerfacecolor='none', markeredgecolor='k'),
+                plt.Line2D([0], [0], marker='o', color='w', label='Posterior', markerfacecolor='k', markeredgecolor='none', alpha=0.5)]
+
+    legend1 = ax[2].legend(handles=handles1, title="Vasculature type", loc='upper right')
+    ax[2].add_artist(legend1)
+    ax[2].legend(handles=handles2, title="Distribution", loc='lower right')
+    ax[2].set_title("PCA - Vasculature distribution")
+    ax[2].set_xlabel("PC1")
+    ax[2].set_ylabel("PC2")
+    plt.tight_layout()
+
     plt.savefig("posterior_mcmc.png")
 
 if __name__ == "__main__":
diff --git a/sandbox/src/posterior_samples_mcmc.csv b/sandbox/src/posterior_samples_mcmc.csv