stash changes

zhanglei1949 · zhanglei1949 · commit d9303ef64ed3 · 2024-12-23T16:42:18.000+08:00
Committed-by: xiaolei.zl@alibaba-inc.com from Dev container
diff --git a/flex/interactive/examples/email/graph.yaml b/flex/interactive/examples/email/graph.yaml
@@ -0,0 +1,22 @@
+name: modern_graph # a "modern" dir must then exist under the ${data} directory
+version: v0.1
+store_type: mutable_csr  # v6d, groot, gart
+description: A graph with 1 vertex type and 1 edge type
+schema:
+  vertex_types:
+    - type_id: 0
+      type_name: vertex
+      properties:
+        - property_id: 0
+          property_name: id
+          property_type:
+            primitive_type: DT_SIGNED_INT64
+      primary_keys:
+        - id  # the only declared property doubles as the primary key
+  edge_types:
+    - type_id: 0
+      type_name: edge
+      vertex_type_pair_relations:
+        - source_vertex: vertex  # both endpoints use the single vertex type
+          destination_vertex: vertex
+          relation: MANY_TO_MANY
diff --git a/flex/interactive/examples/email/import.yaml b/flex/interactive/examples/email/import.yaml
@@ -0,0 +1,31 @@
+graph: modern_graph
+loading_config:  # source, import mode and file format of the input data
+  data_source:
+    scheme: file  # file, oss, s3, hdfs; only file is supported now
+    #location: /home/graphscope/gs_interactive_default_graph/
+  import_option: init # append, overwrite; only init is supported now
+  format:
+    type: csv
+    metadata:
+      delimiter: ","  # other loading configuration goes here
+      header_row: false # whether to use the first row as the header
+      quoting: false
+      quote_char: '"'
+      double_quote: true
+      escape_char: '\'
+      escaping: false
+      block_size: 4MB
+      batch_reader: true
+      null_values: [""]
+
+vertex_mappings:  # input files for each vertex type
+  - type_name: vertex  # must align with the schema
+    inputs:
+      - vertices.csv
+edge_mappings:  # input files for each edge type
+  - type_triplet:
+      edge: edge
+      source_vertex:  vertex
+      destination_vertex:  vertex
+    inputs:
+      - edges.csv
diff --git a/flex/interactive/examples/email/preprocess.py b/flex/interactive/examples/email/preprocess.py
@@ -0,0 +1,43 @@
+#!/bin/python3
+
+import os
+import sys
+
+if __name__ == "__main__":
+    # Expect a arg of file path
+    if len(sys.argv) != 4:
+        print("Usage: python3 preprocess.py <file> <vertex_file> <edge_file>")
+        sys.exit(1)
+    # Get the file path
+    file_path = sys.argv[1]
+    vertex_file_path = sys.argv[2]
+    edge_file_path = sys.argv[3]
+    vertices = set()
+    edges = []
+    # open the file and iterate over the lines
+    with open(file_path, "r") as file:
+        for line in file:
+            # if line starts with #, skip it
+            if line.startswith("#"):
+                continue
+            # split the line by space
+            parts = line.split()
+            # if contains two parts, it is a edge
+            if len(parts) == 2:
+                int_parts = [int(part) for part in parts]
+                # add the vertices to the set
+                vertices.add(int_parts[0])
+                vertices.add(int_parts[1])
+                edges.append(parts)
+    # write vertices to vertices.csv, and edges to edges.csv
+    # sort vertices
+    vertices = sorted(vertices)
+    with open(vertex_file_path, "w") as file:
+        for vertex in vertices:
+            file.write(str(vertex) + "\n")
+    with open(edge_file_path, "w") as file:
+        for edge in edges:
+            file.write(edge[0] + "," + edge[1] + "\n")
+                
+            
+        
diff --git a/flex/tests/leiden/sort_and_compare.py b/flex/tests/leiden/sort_and_compare.py
@@ -0,0 +1,77 @@
+#!/bin/python3
+import sys
+import csv
+
+
+if __name__ == "__main__":
+    # expect two args, the actual file and expected file
+    
+    if len(sys.argv) != 3:
+        print("Usage: sort_and_compare.py <actual> <expected>")
+        sys.exit(1)
+    
+    actual_file = sys.argv[1]
+    expected_file = sys.argv[2]
+    # Each csv files contains two columns, the first column is the cluster id, and second column is the vertex id.
+    # Read the actual file and expected file into two lists
+    actual = {}
+    actual_cluster_map = {}
+    expected = {}
+    expected_cluster_map = {}
+    with open(actual_file, 'r') as f:
+        reader = csv.reader(f,delimiter=' ')
+        for row in reader:
+            vertex_id = int(row[0])
+            cluster_id = int(row[1])
+            actual[vertex_id] = cluster_id
+            if actual_cluster_map.get(cluster_id) is None:
+                actual_cluster_map[cluster_id] = 0
+            actual_cluster_map[cluster_id] += 1
+    print("Finished reading actual file, vertices count: ", len(actual), " num clusters ", len(actual_cluster_map), " min cluster size, max cluster size: ", min(actual_cluster_map.values()), max(actual_cluster_map.values()))
+
+    with open(expected_file, 'r') as f:
+        reader = csv.reader(f, delimiter='\t')
+        for row in reader:
+            vertex_id = int(row[0])
+            cluster_id = int(row[1])
+            expected[vertex_id] = cluster_id
+            if expected_cluster_map.get(cluster_id) is None:
+                expected_cluster_map[cluster_id] = 0
+            expected_cluster_map[cluster_id] += 1
+
+        # cur_cluster = 0
+        # for line in f:
+        #     # each line is like Category:Buprestoidea; 301 302 303 304 305 306 30
+        #     # got all numbers after; and put them into the same cluster
+        #     vertex_ids = line.split(";")[1].split()
+        #     for vertex_id in vertex_ids:
+        #         expected[int(vertex_id)] = cur_cluster
+        #     expected_cluster_map[cur_cluster] = len(vertex_ids)
+        #     cur_cluster += 1
+            
+    print("Finished reading expected file, vertices count: ", len(expected), " num clusters", len(expected_cluster_map),  " min cluster size, max cluster size: ", min(expected_cluster_map.values()), max(expected_cluster_map.values()))
+    actual_cluster_id_to_expected_cluster_id = {}
+    cnt = 0
+    for vertex_id, cluster_id in actual.items():
+        if vertex_id in expected:
+            if actual_cluster_id_to_expected_cluster_id.get(cluster_id) is None:
+                actual_cluster_id_to_expected_cluster_id[cluster_id] = expected[vertex_id]
+            if actual_cluster_id_to_expected_cluster_id[cluster_id] != expected[vertex_id]:
+                cnt += 1
+                # print("Cluster id mismatch for vertex id: ", vertex_id, " actual cluster id: ", cluster_id, " expected cluster id: ", expected[vertex_id])
+        else:
+            print("Vertex id not found in expected file: ", vertex_id)
+            
+    # count the number of cluster with size >2
+    expected_cluster_ids_appeared_in_actual = set()
+    cnt_2 = 0
+    for cluster_id, size in actual_cluster_map.items():
+        if size >= 2:
+            cnt_2 += 1
+            expected_cluster_id =  actual_cluster_id_to_expected_cluster_id.get(cluster_id)
+            expected_cluster_ids_appeared_in_actual.add(expected_cluster_id)
+    print("Number expect cluster with size >= 2 appeared in actual: ", cnt_2, " out of ", len(expected_cluster_map))
+            
+    print("Total cluster id mismatch: ", cnt, " out of ", len(expected), "expected cluster number: ", len(actual_cluster_id_to_expected_cluster_id), " actual cluster number: ", len(expected_cluster_map))
+    
+         
diff --git a/flex/tests/leiden/test.cc b/flex/tests/leiden/test.cc
@@ -17,7 +17,11 @@
 #include <stdio.h>
 #include "include/CPMVertexPartition.h"
 #include "include/GraphHelper.h"
+#include "include/ModularityVertexPartition.h"
 #include "include/Optimiser.h"
+#include "include/RBConfigurationVertexPartition.h"
+#include "include/RBERVertexPartition.h"
+#include "include/SignificanceVertexPartition.h"
 
 #include <glog/logging.h>
 #include "flex/engines/graph_db/database/graph_db.h"
@@ -33,28 +37,33 @@ void graph_db_to_igraph(igraph_t* g, gs::GraphDBSession& sess) {
   igraph_vector_int_t edges;
   igraph_vector_int_init(&edges, 0);
   auto& frag = sess.graph();
+  size_t edges_cnt = 0;
   for (gs::vid_t v = 0; v < frag.vertex_num(0); ++v) {
     auto oe = frag.get_outgoing_edges_raw(0, v, 0, 0);
     while (oe->is_valid()) {
       igraph_vector_int_push_back(&edges, v);
       igraph_vector_int_push_back(&edges, oe->get_neighbor());
       oe->next();
+      edges_cnt++;
     }
   }
+  LOG(INFO) << "Edges count: " << edges_cnt;
 
-  igraph_create(g, &edges, 0, true);
+  igraph_create(g, &edges, frag.vertex_num(0), true);
   igraph_vector_int_destroy(&edges);
 }
 
 int main(int argc, char** argv) {
   // igraph_t g;
   // igraph_famous(&g, "Zachary");
-  if (argc != 3) {
-    LOG(ERROR) << "Usage: " << argv[0] << "<schema_path> <db_path>";
+  if (argc != 4) {
+    LOG(ERROR) << "Usage: " << argv[0]
+               << "<schema_path> <db_path> <output_path>";
     return 1;
   }
   std::string schema_path = argv[1];
   std::string db_path = argv[2];
+  std::string output_path = argv[3];
 
   igraph_t g;
 
@@ -85,12 +94,22 @@ int main(int argc, char** argv) {
 
   o.optimise_partition(&part);
 
-  cout << "Node\tCommunity" << endl;
+  // write to file
+  // /workspaces/GraphScope/flex/interactive/examples/wiki/leiden_out open file
+  FILE* f = fopen(output_path.c_str(), "w");
+  if (f == NULL) {
+    printf("Error opening file!\n");
+    exit(1);
+  }
+  // cout << "Node\tCommunity" << endl;
+  cout << "Number of communities: " << part.n_communities() << endl;
   auto txn = sess.GetReadTransaction();
   for (size_t i = 0; i < graph.vcount(); i++) {
-    cout << part.membership(i) << "," << txn.GetVertexId(0, i).to_string()
-         << endl;
+    // cout << i << "\t" << part.membership[i] << endl;
+    fprintf(f, "%zu\t%zu\n", txn.GetVertexId(0, i).AsInt64(),
+            part.membership()[i]);
   }
+  fclose(f);
 
   igraph_destroy(&g);