Skip to content

Commit eee3e48

Browse files
committed
add pair precision recall stats
1 parent f065785 commit eee3e48

File tree

4 files changed

+164
-2
lines changed

4 files changed

+164
-2
lines changed

configs_experiments/ngrams/cluster_ngraphs_high_res_pcbs.config

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,5 +42,5 @@ ParallelAffinityClusterer:
4242

4343
ParHacClusterer:
4444
parhac_clusterer_config:
45-
weight_threshold: 0.5; 0.7320566343659267; 0.8564127056253706; 0.8833708760578991; 0.9052677145931002; 0.9230534741659427
45+
weight_threshold: 0.5; 0.7320566343659267; 0.82; 0.84; 0.86; 0.88; 0.9; 0.92; 0.94; 0.96; 0.98
4646
epsilon: 0.01;0.1;1
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
Input communities: clusters.pair.cmty
2+
Deterministic: false
3+
4+
statistics_config:
5+
precision_recall_pair_threshold: 0.92
6+
f_score_param: 0.5

stats.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
import json
1010
import pandas as pd
1111

12+
from stats_precision_recall_pair import compute_precision_recall_pair
13+
1214
def getRunTime(clusterer, out_prefix):
1315
cluster_time = -1
1416
out_filename = out_prefix + ".out"
@@ -42,13 +44,18 @@ def getRunTime(clusterer, out_prefix):
4244

4345
def runStats(out_prefix, graph, graph_idx, stats_dict):
4446
out_statistics = out_prefix + ".stats"
47+
out_statistics_pair = out_prefix + ".pair.stats"
4548
in_clustering = out_prefix + ".cluster"
4649
if not os.path.exists(in_clustering) or not os.path.getsize(in_clustering) > 0:
4750
# Either an error or a timeout happened
4851
runner_utils.appendToFile("ERROR", out_statistics)
4952
return
5053
use_input_graph = runner_utils.input_directory + graph
51-
use_input_communities = "" if not runner_utils.communities else "--input_communities=" + runner_utils.input_directory + runner_utils.communities[graph_idx]
54+
input_communities = runner_utils.input_directory + runner_utils.communities[graph_idx]
55+
if "precision_recall_pair_threshold" in runner_utils.stats_config:
56+
compute_precision_recall_pair(in_clustering, input_communities, out_statistics_pair, runner_utils.stats_config, stats_dict)
57+
return
58+
use_input_communities = "" if not runner_utils.communities else "--input_communities=" + input_communities
5259
ss = ("bazel run //clusterers:stats-in-memory_main -- "
5360
"--input_graph=" + use_input_graph + " "
5461
"--is_gbbs_format=" + runner_utils.gbbs_format + " "

stats_precision_recall_pair.py

Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
2+
import logging
3+
import sys
4+
import json
5+
6+
def _config_str_to_dict(input_str):
7+
# e.g.
8+
# input_str = "precision_recall_pair_threshold: 0.92,f_score_param: 0.5"
9+
# result_dict = {'precision_recall_pair_threshold': 0.92, 'f_score_param': 0.5}
10+
11+
12+
# Split the string into key-value pairs
13+
pairs = input_str.split(',')
14+
15+
# Initialize an empty dictionary
16+
result_dict = {}
17+
18+
# Process each key-value pair
19+
for pair in pairs:
20+
# Split the pair by the colon
21+
key, value = pair.split(':')
22+
# Remove any leading/trailing whitespace
23+
key = key.strip()
24+
value = value.strip()
25+
# Convert the value to float if possible
26+
try:
27+
value = float(value)
28+
except ValueError:
29+
pass # Keep the value as a string if it cannot be converted
30+
# Add the key-value pair to the dictionary
31+
result_dict[key] = value
32+
33+
return result_dict
34+
35+
36+
def read_clusters(cluster_file):
    """
    Reads the clusters from a file and returns a dictionary mapping node IDs
    to a set of cluster IDs.

    Each line of the file is one cluster: a tab-separated list of node IDs.
    The cluster ID is the 0-based line number, so a node appearing on several
    lines maps to several cluster IDs (overlapping clusters).
    """
    node_to_clusters = {}
    with open(cluster_file, 'r') as fh:
        for cluster_id, row in enumerate(fh):
            for raw in row.strip().split("\t"):
                node = raw.strip()
                if not node:
                    continue
                node_to_clusters.setdefault(node, set()).add(cluster_id)
    return node_to_clusters
52+
def read_ground_truth_pairs(ground_truth_file):
    """
    Reads the ground truth pairs from a file and returns a list of tuples
    (node1, node2, weight).

    Expected format: tab-separated lines "node1<TAB>node2<TAB>weight".
    Lines with fewer than three fields, or a weight that is not a number,
    are skipped with a warning. Warnings go through logging (consistent
    with compute_precision_recall) instead of bare print.
    """
    pairs = []
    with open(ground_truth_file, 'r') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) < 3:
                logging.warning("Ignoring invalid line: %s", line.strip())
                continue
            node1 = parts[0].strip()
            node2 = parts[1].strip()
            try:
                weight = float(parts[2].strip())
            except ValueError:
                logging.warning("Invalid weight '%s' in line: %s", parts[2], line.strip())
                continue
            pairs.append((node1, node2, weight))
    return pairs
73+
def compute_precision_recall(node_to_clusters, pairs, threshold):
    """
    Computes precision and recall based on the clusters and ground truth pairs.
    Handles overlapping clusters where a node can belong to multiple clusters.

    node_to_clusters: map from node id to a set of clusters
    pairs: list of (u, v, w) triplets
    threshold: a pair is treated as positive when its weight is strictly
        greater than this value

    Returns (precision, recall, TP, FP, TN, FN).
    """
    tp = fp = tn = fn = 0

    for u, v, weight in pairs:
        # Nodes missing from the clustering cannot be scored; skip the pair.
        if u not in node_to_clusters or v not in node_to_clusters:
            logging.warning("skipping nodes %s, %s", u, v)
            continue

        # With overlapping clusters, "same cluster" means the two nodes
        # share at least one cluster id.
        together = not node_to_clusters[u].isdisjoint(node_to_clusters[v])
        positive = weight > threshold

        if positive and together:
            tp += 1
        elif positive:
            fn += 1
        elif together:
            fp += 1
        else:
            tn += 1

    # Guard the divisions so an empty confusion row yields 0, not an error.
    precision = tp / (tp + fp) if tp + fp else 0
    recall = tp / (tp + fn) if tp + fn else 0
    return precision, recall, tp, fp, tn, fn
117+
118+
def compute_precision_recall_pair(in_clustering, input_communities, out_statistics, stats_config, stats_dict):
    """
    Compute pair precision and recall, and record result into stats_dict
    (the updated stats_dict is also dumped as JSON to out_statistics).
    """
    config = _config_str_to_dict(stats_config)
    threshold = config["precision_recall_pair_threshold"]
    # f_score_param is the beta of an F-beta score; defaults to 1 (plain F1).
    beta = config.get("f_score_param", 1)

    print()
    print("clustering file", in_clustering)
    print("community file", input_communities)
    print("stats file", out_statistics)
    print("parameters, ", threshold, beta)

    # Read clusters and ground truth pairs, then score the clustering.
    precision, recall, TP, FP, TN, FN = compute_precision_recall(
        read_clusters(in_clustering),
        read_ground_truth_pairs(input_communities),
        threshold,
    )

    # F-beta score; defined as 0 when either precision or recall is 0.
    f_score = 0
    if precision != 0 and recall != 0:
        beta_sq = beta * beta
        f_score = (1 + beta_sq) * precision * recall / (beta_sq * precision + recall)

    stats_dict["fScore_mean"] = f_score
    stats_dict["communityPrecision_mean"] = precision
    stats_dict["communityRecall_mean"] = recall
    stats_dict["PrecisionRecallPairThreshold"] = threshold
    stats_dict["fScoreParam"] = beta

    with open(out_statistics, 'w', encoding='utf-8') as f:
        json.dump(stats_dict, f, indent=4)

0 commit comments

Comments
 (0)