Skip to content

Commit 7a6d09c

Browse files
committed
multiple thresholds
1 parent eee3e48 commit 7a6d09c

File tree

7 files changed

+95
-81
lines changed

7 files changed

+95
-81
lines changed

configs_experiments/ngrams/cluster_ngraphs_high_res_pcbs.config

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Input directory: /home/sy/ParClusterers/ngrams_graphs/NGramsGraphs/
22
Output directory: /home/sy/ParClusterers/results/out_ngrams_high_res_pcbs/
3-
CSV output directory: /home/sy/ParClusterers/results/out_ngrams_high_res_0.92_pcbs_csv/
3+
CSV output directory: /home/sy/ParClusterers/results/out_ngrams_high_res_pcbs_csv/
44
Clusterers: LDDClusterer;TectonicClusterer;ParallelAffinityClusterer;ParHacClusterer;ParallelCorrelationClusterer;ParallelModularityClusterer
55
Graphs: ngrams.graph.gbbs
66
GBBS format: true

configs_experiments/ngrams/cluster_ngraphs_pcbs.config

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Input directory: /home/sy/ParClusterers/ngrams_graphs/NGramsGraphs/
22
Output directory: /home/sy/ParClusterers/results/out_ngrams_pcbs/
3-
CSV output directory: /home/sy/ParClusterers/results/out_ngrams_0.88_pcbs_csv/
3+
CSV output directory: /home/sy/ParClusterers/results/out_ngrams_pcbs_csv/
44
Clusterers: LDDClusterer;ScanClusterer;LabelPropagationClusterer;SLPAClusterer;TectonicClusterer;ConnectivityClusterer;ParallelAffinityClusterer;ParHacClusterer;ParallelCorrelationClusterer;ParallelModularityClusterer
55
Graphs: ngrams.graph.gbbs
66
GBBS format: true

configs_experiments/ngrams/stats_ngrams.config

Lines changed: 0 additions & 8 deletions
This file was deleted.

configs_experiments/ngrams/stats_pair_ngrams.config

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,5 @@ Input communities: clusters.pair.cmty
22
Deterministic: false
33

44
statistics_config:
5-
precision_recall_pair_threshold: 0.92
5+
precision_recall_pair_thresholds: 0.86;0.88;0.90;0.92;0.94
66
f_score_param: 0.5

plotting/plot_pareto_ngrams.py

Lines changed: 34 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
import numpy as np
88
from plotting_utils import *
99

10+
import ast
11+
1012
plt.rcParams["ps.useafm"] = True
1113
plt.rcParams["pdf.use14corefonts"] = True
1214
# plt.rcParams["text.usetex"] = True
@@ -418,14 +420,16 @@ def getAUCTable(df, df_pr_pareto, print_table=False):
418420

419421

420422
def plot_ngrams():
421-
threshold = 0.92
422-
df_pcbs = pd.read_csv(base_addr + f"out_ngrams_{threshold}_pcbs_csv/stats.csv")
423-
df_pcbs_high_res = pd.read_csv(base_addr + f"out_ngrams_high_res_{threshold}_pcbs_csv/stats.csv")
424-
df = pd.concat([df_pcbs, df_pcbs_high_res])
423+
# df_pcbs = pd.read_csv(base_addr + f"out_ngrams_pcbs_csv/stats.csv")
424+
df_pcbs_high_res = pd.read_csv(base_addr + f"out_ngrams_high_res_pcbs_csv/stats.csv")
425+
df = pd.concat([df_pcbs_high_res]) #df_pcbs,
425426

426427
df = df.dropna(how="all")
427428
replace_graph_names(df)
428429
df = add_epsilon_to_hac(df)
430+
df["fScore_mean"] = df["fScore_mean"].apply(ast.literal_eval)
431+
df["communityPrecision_mean"] = df["communityPrecision_mean"].apply(ast.literal_eval)
432+
df["communityRecall_mean"] = df["communityRecall_mean"].apply(ast.literal_eval)
429433

430434
our_methods = [
431435
"KCoreClusterer",
@@ -443,25 +447,32 @@ def plot_ngrams():
443447
"ParHACClusterer_1",
444448
]
445449

446-
df_pcbs = df[df["Clusterer Name"].isin(our_methods)]
447-
448-
# Get AUC table
449-
df_pr_pareto = FilterParetoPRMethod(df_pcbs)
450-
getAUCTable(df_pcbs, df_pr_pareto)
451-
452-
# Plot Precision Recall Pareto frontier for PCBS methods
453-
axes = plotPRPareto(df_pr_pareto, only_high_p=True)
454-
axes[0].set_ylim((0.5, 0.8))
455-
plt.savefig(base_addr + f"pr_uci_{threshold}.pdf", bbox_inches="tight")
456-
print("plotted pr_uci.pdf")
457-
458-
# Plot F_0.5 runtime Pareto frontier for PCBS methods
459-
clusterers = df_pcbs["Clusterer Name"].unique()
460-
dfs, graphs = GetParetoDfs(df_pcbs)
461-
plotPareto(dfs, graphs, clusterers)
462-
plt.tight_layout()
463-
plt.savefig(base_addr + f"time_f1_uci_{threshold}.pdf", bbox_inches="tight")
464-
print("plotted time_f1_uci.pdf")
450+
451+
thresholds = [0.86, 0.88, 0.90, 0.92, 0.94]
452+
453+
for threshold in thresholds:
454+
df_pcbs = df[df["Clusterer Name"].isin(our_methods)]
455+
456+
df_pcbs["fScore_mean"] = df["fScore_mean"].apply(lambda k: k[threshold])
457+
df_pcbs["communityPrecision_mean"] = df["communityPrecision_mean"].apply(lambda k: k[threshold])
458+
df_pcbs["communityRecall_mean"] = df["communityRecall_mean"].apply(lambda k: k[threshold])
459+
460+
# Get AUC table
461+
df_pr_pareto = FilterParetoPRMethod(df_pcbs)
462+
getAUCTable(df_pcbs, df_pr_pareto)
463+
464+
# Plot Precision Recall Pareto frontier for PCBS methods
465+
axes = plotPRPareto(df_pr_pareto, only_high_p=True) #
466+
plt.savefig(base_addr + f"pr_uci_{threshold}.pdf", bbox_inches="tight")
467+
print("plotted pr_uci.pdf")
468+
469+
# Plot F_0.5 runtime Pareto frontier for PCBS methods
470+
clusterers = df_pcbs["Clusterer Name"].unique()
471+
dfs, graphs = GetParetoDfs(df_pcbs)
472+
plotPareto(dfs, graphs, clusterers)
473+
plt.tight_layout()
474+
plt.savefig(base_addr + f"time_f1_uci_{threshold}.pdf", bbox_inches="tight")
475+
print("plotted time_f1_uci.pdf")
465476

466477
if __name__ == "__main__":
467478
base_addr = "results/"

stats.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ def runStats(out_prefix, graph, graph_idx, stats_dict):
5252
return
5353
use_input_graph = runner_utils.input_directory + graph
5454
input_communities = runner_utils.input_directory + runner_utils.communities[graph_idx]
55-
if "precision_recall_pair_threshold" in runner_utils.stats_config:
55+
if "precision_recall_pair_thresholds" in runner_utils.stats_config:
5656
compute_precision_recall_pair(in_clustering, input_communities, out_statistics_pair, runner_utils.stats_config, stats_dict)
5757
return
5858
use_input_communities = "" if not runner_utils.communities else "--input_communities=" + input_communities

stats_precision_recall_pair.py

Lines changed: 57 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@
55

66
def _config_str_to_dict(input_str):
77
# e.g.
8-
# input_str = "precision_recall_pair_threshold: 0.92,f_score_param: 0.5"
9-
# result_dict = {'precision_recall_pair_threshold': 0.92, 'f_score_param': 0.5}
8+
# input_str = "precision_recall_pair_thresholds: 0.86;0.88;0.90;0.92;0.94,f_score_param: 0.5"
9+
# result_dict = {'precision_recall_pair_thresholds': [0.86, 0.88, 0.90, 0.92, 0.94], 'f_score_param': 0.5}
1010

1111

1212
# Split the string into key-value pairs
@@ -24,12 +24,15 @@ def _config_str_to_dict(input_str):
2424
value = value.strip()
2525
# Convert the value to float if possible
2626
try:
27-
value = float(value)
27+
if key == "precision_recall_pair_thresholds":
28+
# Split the value by semicolons and convert each to float
29+
value = [float(v.strip()) for v in value.split(';')]
30+
else:
31+
value = float(value)
2832
except ValueError:
2933
pass # Keep the value as a string if it cannot be converted
3034
# Add the key-value pair to the dictionary
3135
result_dict[key] = value
32-
3336
return result_dict
3437

3538

@@ -70,7 +73,7 @@ def read_ground_truth_pairs(ground_truth_file):
7073
print(f"Ignoring invalid line: {line.strip()}")
7174
return pairs
7275

73-
def compute_precision_recall(node_to_clusters, pairs, threshold):
76+
def compute_precision_recall(node_to_clusters, pairs, thresholds, f_score_param):
7477
"""
7578
Computes precision and recall based on the clusters and ground truth pairs.
7679
Handles overlapping clusters where a node can belong to multiple clusters.
@@ -83,66 +86,74 @@ def compute_precision_recall(node_to_clusters, pairs, threshold):
8386
FP = 0 # False Positive
8487
FN = 0 # False Negative
8588

86-
for node1, node2, weight in pairs:
87-
# Determine if the pair is positive or negative
88-
is_positive = weight > threshold
89-
90-
# Determine if the nodes are in the same cluster (overlapping clusters)
91-
if node1 in node_to_clusters and node2 in node_to_clusters:
92-
clusters1 = node_to_clusters[node1]
93-
clusters2 = node_to_clusters[node2]
94-
in_same_cluster = bool(clusters1 & clusters2) # Check for non-empty intersection
95-
else:
96-
logging.warning("skipping nodes %s, %s", node1, node2)
97-
# Nodes not found in clusters; skip this pair
98-
continue
99-
100-
if is_positive:
101-
if in_same_cluster:
102-
TP += 1
103-
else:
104-
FN += 1
105-
else:
106-
if not in_same_cluster:
107-
TN += 1
108-
else:
109-
FP += 1
110-
111-
# Calculate precision and recall
112-
precision = TP / (TP + FP) if (TP + FP) > 0 else 0
113-
recall = TP / (TP + FN) if (TP + FN) > 0 else 0
114-
115-
return precision, recall, TP, FP, TN, FN
89+
precisions = {}
90+
recalls = {}
91+
f_scores = {}
92+
93+
for threshold in thresholds:
94+
for node1, node2, weight in pairs:
95+
# Determine if the pair is positive or negative
96+
is_positive = weight > threshold
97+
98+
# Determine if the nodes are in the same cluster (overlapping clusters)
99+
if node1 in node_to_clusters and node2 in node_to_clusters:
100+
clusters1 = node_to_clusters[node1]
101+
clusters2 = node_to_clusters[node2]
102+
in_same_cluster = bool(clusters1 & clusters2) # Check for non-empty intersection
103+
else:
104+
logging.warning("skipping nodes %s, %s", node1, node2)
105+
# Nodes not found in clusters; skip this pair
106+
continue
107+
108+
if is_positive:
109+
if in_same_cluster:
110+
TP += 1
111+
else:
112+
FN += 1
113+
else:
114+
if not in_same_cluster:
115+
TN += 1
116+
else:
117+
FP += 1
118+
119+
# Calculate precision and recall
120+
precision = TP / (TP + FP) if (TP + FP) > 0 else 0
121+
recall = TP / (TP + FN) if (TP + FN) > 0 else 0
122+
f_score = 0
123+
if precision !=0 and recall != 0:
124+
f_score = (1 + f_score_param * f_score_param) * precision * recall / ((f_score_param * f_score_param * precision) + recall)
125+
126+
precisions[threshold] = precision
127+
recalls[threshold] = recall
128+
f_scores[threshold] = f_score
129+
130+
return precisions, recalls, f_scores
116131

117132

118133
def compute_precision_recall_pair(in_clustering, input_communities, out_statistics, stats_config, stats_dict):
119134
"""
120135
Compute pair precision and recall, and record result into stats_dict
121136
"""
122137
stats_config = _config_str_to_dict(stats_config)
123-
precision_recall_pair_threshold = stats_config["precision_recall_pair_threshold"]
138+
precision_recall_pair_thresholds = stats_config["precision_recall_pair_thresholds"]
124139
f_score_param = stats_config.get("f_score_param", 1)
125140
print()
126141
print("clustering file", in_clustering)
127142
print("community file", input_communities)
128143
print("stats file", out_statistics)
129-
print("parameters, ", precision_recall_pair_threshold, f_score_param)
144+
print("parameters, ", precision_recall_pair_thresholds, f_score_param)
130145

131146
# Read clusters and ground truth pairs
132147
clusters = read_clusters(in_clustering)
133148
pairs = read_ground_truth_pairs(input_communities)
134149

135150
# Compute precision and recall
136-
precision, recall, TP, FP, TN, FN = compute_precision_recall(clusters, pairs, precision_recall_pair_threshold)
137-
138-
f_score = 0
139-
if precision !=0 and recall != 0:
140-
f_score = (1 + f_score_param * f_score_param) * precision * recall / ((f_score_param * f_score_param * precision) + recall)
151+
precisions, recalls, f_scores = compute_precision_recall(clusters, pairs, precision_recall_pair_thresholds, f_score_param)
141152

142-
stats_dict["fScore_mean"] = f_score
143-
stats_dict["communityPrecision_mean"] = precision
144-
stats_dict["communityRecall_mean"] = recall
145-
stats_dict["PrecisionRecallPairThreshold"] = precision_recall_pair_threshold
153+
stats_dict["fScore_mean"] = f_scores
154+
stats_dict["communityPrecision_mean"] = precisions
155+
stats_dict["communityRecall_mean"] = recalls
156+
stats_dict["PrecisionRecallPairThresholds"] = precision_recall_pair_thresholds
146157
stats_dict["fScoreParam"] = f_score_param
147158

148159
with open(out_statistics, 'w', encoding='utf-8') as f:

0 commit comments

Comments
 (0)