diff --git a/clusterers/clustering_stats.cc b/clusterers/clustering_stats.cc
index 8eed452..9d251be 100644
--- a/clusterers/clustering_stats.cc
+++ b/clusterers/clustering_stats.cc
@@ -71,14 +71,20 @@ absl::StatusOr GetStats(const GbbsGraph& graph,
   ComputeEdgeDensity(graph, clustering, &clustering_stats, cluster_ids, clustering_stats_config);
   auto end_edge_density = std::chrono::steady_clock::now();
   PrintTime(end_diameter, end_edge_density, "Compute EdgeDensity");
+  ComputeEdgeDensityOverlap(graph, clustering, &clustering_stats, cluster_ids, clustering_stats_config);
+  auto end_edge_density_overlap = std::chrono::steady_clock::now();
+  PrintTime(end_edge_density, end_edge_density_overlap, "Compute EdgeDensityOverlap");
   ComputeTriangleDensity(graph, clustering, &clustering_stats, cluster_ids, clustering_stats_config);
   auto end_triangle_density = std::chrono::steady_clock::now();
-  PrintTime(end_edge_density, end_triangle_density, "Compute Triangle Density");
+  PrintTime(end_edge_density_overlap, end_triangle_density, "Compute Triangle Density");
+  ComputeTriangleDensityOverlap(graph, clustering, &clustering_stats, cluster_ids, clustering_stats_config);
+  auto end_triangle_density_overlap = std::chrono::steady_clock::now();
+  PrintTime(end_triangle_density, end_triangle_density_overlap, "Compute Triangle Density Overlap");
 
   size_t n = graph.Graph()->n;
   ComputeARI(n, clustering, &clustering_stats, communities, clustering_stats_config);
   auto end_ari = std::chrono::steady_clock::now();
-  PrintTime(end_triangle_density, end_ari, "Compute ARI");
+  PrintTime(end_triangle_density_overlap, end_ari, "Compute ARI");
   ComputeNMI(n, clustering, &clustering_stats, communities, clustering_stats_config);
   auto end_nmi = std::chrono::steady_clock::now();
   PrintTime(end_ari, end_nmi, "Compute NMI");
diff --git a/clusterers/clustering_stats.proto b/clusterers/clustering_stats.proto
index 2557466..f835a80 100644
--- a/clusterers/clustering_stats.proto
+++ b/clusterers/clustering_stats.proto
@@ -14,6 +14,8 @@ message ClusteringStatsConfig {
   optional bool compute_precision_recall = 9;
   optional bool compute_nmi = 10;
   optional double f_score_param = 11;
+  optional bool compute_edge_density_overlap = 12;
+  optional bool compute_triangle_density_overlap = 13;
 }
 
 message DistributionStats {
@@ -44,4 +46,6 @@ message ClusteringStatistics {
   optional double f_score_param = 31;
   optional double weighted_edge_density_mean = 32;
   optional double weighted_triangle_density_mean = 33;
+  optional double weighted_edge_density_overlap_mean = 34;
+  optional double weighted_triangle_density_overlap_mean = 35;
 }
\ No newline at end of file
diff --git a/clusterers/stats/stats_density.h b/clusterers/stats/stats_density.h
index ab00d97..b9aa504 100644
--- a/clusterers/stats/stats_density.h
+++ b/clusterers/stats/stats_density.h
@@ -74,6 +74,96 @@ inline absl::Status ComputeEdgeDensity(const GbbsGraph& graph,
   return absl::OkStatus();
 }
 
+// Computes the edge density of every cluster when clusters may overlap.
+//
+// A cluster's density is the edge count reported by get_subgraph_num_edges()
+// divided by size * (size - 1); a single-vertex cluster gets density 0. When
+// the clustering holds exactly one cluster, graph.Graph()->m divided by
+// n * (n - 1) is used instead. For every vertex a cluster contains, the
+// cluster's density is added to cluster_sum and 1 is added to cluster_count.
+//
+// Returns OkStatus() right away when
+// clustering_stats_config.compute_edge_density_overlap() is false.
+// cluster_ids is not read in the visible body; cluster_ids_overlap is stamped
+// per cluster instead.
+//
+// NOTE(review): this patch reached review with every "<...>" span removed, so
+// template argument lists (parlay::sequence, std::vector, static_cast) and the
+// final reduction loop are missing below. They are kept exactly as received;
+// restore them from the original commit before building.
+inline absl::Status ComputeEdgeDensityOverlap(const GbbsGraph& graph,
+  const InMemoryClusterer::Clustering& clustering, ClusteringStatistics* clustering_stats,
+  const parlay::sequence& cluster_ids, const ClusteringStatsConfig& clustering_stats_config) {
+  const bool compute_edge_density_overlap = clustering_stats_config.compute_edge_density_overlap();
+  if (!compute_edge_density_overlap) {
+    return absl::OkStatus();
+  }
+
+  parlay::sequence cluster_ids_overlap = parlay::sequence(graph.Graph()->n);
+  // Clusters are visited one at a time: overlapping clusters can write the same
+  // vertex slot, so running the outer loop in parallel would be a data race.
+  for (size_t i = 0; i < clustering.size(); i++) {
+    const auto& cluster = clustering[i];
+    parlay::parallel_for(0, cluster.size(), [&](size_t j){
+      cluster_ids_overlap[cluster[j]] = i;
+    });
+  }
+
+  std::size_t n = graph.Graph()->n;
+  auto result = std::vector(clustering.size());
+
+  if(clustering.size()==1){
+    result[0] = (static_cast(graph.Graph()->m)) / (static_cast(n)*(n-1));
+  }else{
+    for(size_t i = 0; i < clustering.size(); i++) {
+      if (clustering[i].size() == 1){
+        result[i] = 0;
+      }
+      else{
+        // Stamp this cluster's members with i before the helper call; presumably
+        // the helper uses the map to tell members from non-members - confirm.
+        const auto& cluster = clustering[i];
+        parlay::parallel_for(0, cluster.size(), [&](size_t j){
+          cluster_ids_overlap[cluster[j]] = i;
+        });
+        size_t m_subgraph = get_subgraph_num_edges(graph, clustering[i], cluster_ids_overlap);
+        double m_total = clustering[i].size()*(clustering[i].size()-1);
+        // std::cout << "m_subgraph" << " " << m_subgraph << std::endl;
+        // std::cout << "m_total" << " " << m_total << std::endl;
+        result[i] = (static_cast(m_subgraph)) / (static_cast(m_total));
+      }
+    }
+  }
+  auto result_func = [&](std::size_t i) {
+    return result[i];
+  };
+  parlay::sequence cluster_sum = parlay::sequence(n, 0);
+  parlay::sequence cluster_count = parlay::sequence(n, 0);
+  // Sequential over clusters: a vertex shared by two clusters would otherwise
+  // have its cluster_sum / cluster_count slot updated by two tasks at once.
+  // Assumes a vertex is listed at most once inside a single cluster.
+  for (size_t i = 0; i < clustering.size(); i++) {
+    const auto& cluster = clustering[i];
+    parlay::parallel_for(0, cluster.size(), [&](size_t j){
+      cluster_sum[cluster[j]] += result_func(i);
+      cluster_count[cluster[j]] += 1;
+    });
+  }
+
+  double weighted_mean_overlap = 0;
+  // NOTE(review): the next line is as received; the loop header, its body and
+  // the receiver of the setter call were lost with the stripped "<...>" span.
+  for (int i=0;iset_weighted_edge_density_overlap_mean(weighted_mean_overlap);
+
+  return absl::OkStatus();
+}
+
 // compute the triangle density of each cluster
 // triangle density is the number of triangles divided by the number of wedges
 // if no wedge, density is 0
@@ -129,6 +219,99 @@ inline absl::Status ComputeTriangleDensity(const GbbsGraph& graph,
   return absl::OkStatus();
 }
 
+// Computes the triangle density of every cluster when clusters may overlap.
+//
+// For each cluster a subgraph is obtained from get_subgraph(); its density is
+// the count returned by gbbs::Triangle_degree_ordering() divided by
+// get_num_wedges(), or 0 when there is no wedge. Triangle counting is skipped
+// (count 0) when the subgraph has fewer than 3 edges or 3 vertices. For every
+// vertex a cluster contains, the cluster's density is added to cluster_sum and
+// 1 is added to cluster_count.
+//
+// Returns OkStatus() right away when
+// clustering_stats_config.compute_triangle_density_overlap() is false.
+// cluster_ids is not read in the visible body; cluster_ids_overlap is stamped
+// per cluster instead.
+//
+// NOTE(review): this patch reached review with every "<...>" span removed, so
+// template argument lists (parlay::sequence, std::vector, static_cast, and
+// possibly get_subgraph) and the final reduction loop are missing below. They
+// are kept exactly as received; restore them from the original commit.
+inline absl::Status ComputeTriangleDensityOverlap(const GbbsGraph& graph,
+  const InMemoryClusterer::Clustering& clustering, ClusteringStatistics* clustering_stats,
+  const parlay::sequence& cluster_ids, const ClusteringStatsConfig& clustering_stats_config) {
+  const bool compute_triangle_density_overlap = clustering_stats_config.compute_triangle_density_overlap();
+  if (!compute_triangle_density_overlap) {
+    return absl::OkStatus();
+  }
+
+  parlay::sequence cluster_ids_overlap = parlay::sequence(graph.Graph()->n);
+  // Clusters are visited one at a time: overlapping clusters can write the same
+  // vertex slot, so running the outer loop in parallel would be a data race.
+  for (size_t i = 0; i < clustering.size(); i++) {
+    const auto& cluster = clustering[i];
+    parlay::parallel_for(0, cluster.size(), [&](size_t j){
+      cluster_ids_overlap[cluster[j]] = i;
+    });
+  }
+
+  std::size_t n = graph.Graph()->n;
+  auto result = std::vector(clustering.size());
+  // Callback handed to Triangle_degree_ordering; it does nothing, only the returned count is used.
+  auto f = [&] (gbbs::uintE u, gbbs::uintE v, gbbs::uintE w) { };
+
+  //even if clustering.size()==1, we need to get the subgraph because could not match 'symmetric_graph' against 'symmetric_ptr_graph'
+  for(size_t i = 0; i < clustering.size(); i++) {
+    // Stamp this cluster's members with i before the helper call; presumably
+    // the helper uses the map to tell members from non-members - confirm.
+    const auto& cluster = clustering[i];
+    parlay::parallel_for(0, cluster.size(), [&](size_t j){
+      cluster_ids_overlap[cluster[j]] = i;
+    });
+    auto G = get_subgraph(graph, clustering[i], cluster_ids_overlap); //have to use unweighted graph, otherwise result is wrong
+    size_t num_wedges = get_num_wedges(&G);
+    if(num_wedges == 0){
+      result[i] = 0;
+    }else{
+      size_t num_tri = 0;
+      if (G.num_edges() >= 3 && G.num_vertices() >= 3){
+        num_tri = gbbs::Triangle_degree_ordering(G, f);
+      }
+      result[i] = (static_cast(num_tri)) / (static_cast(num_wedges));
+    }
+  }
+  // for(double l:result) std::cout << l << std::endl;
+  auto result_func = [&](std::size_t i) {
+    return result[i];
+  };
+
+  parlay::sequence cluster_sum = parlay::sequence(n, 0);
+  parlay::sequence cluster_count = parlay::sequence(n, 0);
+  // Sequential over clusters: a vertex shared by two clusters would otherwise
+  // have its cluster_sum / cluster_count slot updated by two tasks at once.
+  // Assumes a vertex is listed at most once inside a single cluster.
+  for (size_t i = 0; i < clustering.size(); i++) {
+    const auto& cluster = clustering[i];
+    parlay::parallel_for(0, cluster.size(), [&](size_t j){
+      cluster_sum[cluster[j]] += result_func(i);
+      cluster_count[cluster[j]] += 1;
+    });
+  }
+
+  double weighted_mean_overlap = 0;
+  // NOTE(review): the next line is as received; the loop header, its body and
+  // the receiver of the setter call were lost with the stripped "<...>" span.
+  for (int i=0;iset_weighted_triangle_density_overlap_mean(weighted_mean_overlap);
+
+  return absl::OkStatus();
+}
+
 } // namespace research_graph::in_memory