Commit cad412d

fixed evil read_edgelist bug

1 parent 0ea9160 commit cad412d

3 files changed: +116, -41 lines

TODO.txt

Lines changed: 2 additions & 3 deletions
@@ -1,5 +1,7 @@
 TODO:
 
+Add GGVec negative_lr_ratio parameter
+
 Harden n2v walks to dead end nodes like in regular random walks
 
 self.names EVERYWHERE!
@@ -27,7 +29,6 @@ Run grid searches, write paper
 
 KarateClub graphs tend to perform poorly for 1st order methods
 
-
 ### This goes along with GraRep recommendations
 
 in BioNEV : "A large value for link prediction tasks (e.g. 3, 4);a small value for node classification tasks (e.g.1, 2)" (p.9)
@@ -54,8 +55,6 @@ Embed Twitter
 
 -------------
 
-Finish Up node names bullshit
-
 mmap support
 read_edgelist --> mmaped src/dst/weights
 Should be a different class (mmap_graph?)

csrgraph/graph.py

Lines changed: 59 additions & 38 deletions
@@ -21,7 +21,7 @@
 from csrgraph import methods, random_walks
 from csrgraph import ggvec, glove, grarep
 
-UINT32_MAX = (2**32)-1
+UINT32_MAX = (2**32) - 1
 
 class csrgraph():
     """
@@ -105,7 +105,7 @@ def __init__(self, data, nodenames=None, copy=True, threads=0):
         if nodenames is not None:
             self.names = pd.Series(nodenames)
         else:
-            self.names = None
+            self.names = pd.Series(np.arange(self.nnodes))
         # Bounds check once here otherwise there be dragons later
         max_idx = np.max(self.dst)
         if self.nnodes < max_idx:
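
With this change a graph always carries a `names` Series, defaulting to integer IDs when no `nodenames` are passed, so downstream code no longer needs a `None` branch. A quick sketch of the resulting behavior (assuming the constructor accepts a SciPy CSR matrix; `cg` here is `import csrgraph as cg`):

```python
import numpy as np
from scipy import sparse
import csrgraph as cg

# A 3-node graph built without node names.
# Before this commit G.names was None; now it defaults to integer IDs.
G = cg.csrgraph(sparse.eye(3, format='csr'))
print(G.names.values)   # [0 1 2]
print(G[0])             # neighbor names of node 0 -- no None branch needed
```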
@@ -137,20 +137,20 @@ def set_threads(self, threads):
         _src_multiply.recompile()
         _dst_multiply.recompile()
 
-
     def __getitem__(self, node):
         """
-        Bracket operator
-
-        Gets names of neighbor nodes
+        [] operator
+        like networkX, gets names of neighbor nodes
         """
-        if self.names is not None:
-            node_id = self.names[self.names == node].index[0]
-        else:
-            node_id = node
-        edges = self.dst[self.src[node_id]:
-                         self.src[node_id+1]]
-        return self.names[edges].values
+        # Get node ID from names array
+        # This is O(n) by design -- we more often get names from IDs
+        # than we get IDs from names and we don't want to hold 2 maps
+        # TODO : replace names with a pd.Index and use get_loc
+        node_id = self.names[self.names == node].index[0]
+        edges = self.dst[
+            self.src[node_id] : self.src[node_id+1]
+        ]
+        return self.names.iloc[edges].values
 
     def nodes(self):
         """
@@ -173,9 +173,12 @@ def normalize(self, return_self=True):
         """
         new_weights = _row_norm(self.weights, self.src)
         if return_self:
-            self.weights = new_weights
-            if hasattr(self, 'mat'):
-                self.mat=sparse.csr_matrix((self.weights, self.dst, self.src))
+            self.mat = sparse.csr_matrix((new_weights, self.dst, self.src))
+            # Point objects to the correct places
+            self.weights = self.mat.data
+            self.src = self.mat.indptr
+            self.dst = self.mat.indices
+            gc.collect()
             return self
         else:
             return csrgraph(sparse.csr_matrix(
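
`normalize` now rebuilds the csr_matrix first, then points `weights`/`src`/`dst` at `mat.data`/`mat.indptr`/`mat.indices`, so the attributes and the matrix share one set of buffers instead of drifting apart. A small sketch of the property being relied on (SciPy generally wraps the passed arrays without copying when dtypes are compatible; an observation about current SciPy behavior, not a documented guarantee):

```python
import numpy as np
from scipy import sparse

weights = np.array([0.5, 0.5, 1.0])
dst = np.array([1, 2, 0], dtype=np.int32)
src = np.array([0, 2, 3, 3], dtype=np.int32)
mat = sparse.csr_matrix((weights, dst, src), shape=(3, 3))

# With compatible dtypes the matrix reuses the input buffers,
# so attributes pointed at mat.data / mat.indices / mat.indptr
# stay in sync with the matrix itself.
print(np.shares_memory(mat.data, weights))   # True
```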
@@ -458,12 +461,25 @@ def random_walk_resample(self, walklen=4, epochs=30):
 #
 #
 
-def read_edgelist(f, directed=True, sep="\t", header=None, **readcsvkwargs):
+def read_edgelist(f, directed=True, sep=r"\s+", header=None, **readcsvkwargs):
     """
-    Creates a csrgraph from an edgelist
+    Creates a csrgraph from an edgelist.
+
+    The edgelist should be in the form
+        [source destination]
+    or
+        [source destination edge_weight]
+
+    The first column needs to be the source, the second the destination.
+    If there is a third column it's assumed to be edge weights.
+
+    Otherwise, all arguments from pandas.read_csv can be used to read the file.
 
     f : str
         Filename to read
+    directed : bool
+        Whether the graph is directed or undirected.
+        All csrgraphs are directed, undirected graphs simply add "return edges"
     sep : str
         CSV-style separator. Eg. Use "," if comma separated
     header : int or None
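
Given the new docstring and the `\s+` default separator, a usage sketch (the file name and its contents are invented for illustration):

```python
import csrgraph as cg

# edges.txt -- whitespace separated, optional third weight column:
#   a  b  1.5
#   b  c  0.5
G = cg.read_edgelist("edges.txt", directed=False, sep=r"\s+")
print(G.names.values)   # node names factored to IDs: ['a' 'b' 'c']
print(G['a'])           # neighbor names of 'a' via __getitem__
```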
@@ -476,6 +492,7 @@ def read_edgelist(f, directed=True, sep="\t", header=None, **readcsvkwargs):
     elist = pd.read_csv(f, sep=sep, header=header, **readcsvkwargs)
     if len(elist.columns) == 2:
         elist.columns = ['src', 'dst']
+        elist['weight'] = np.ones(elist.shape[0])
    elif len(elist.columns) == 3:
         elist.columns = ['src', 'dst', 'weight']
     else:
@@ -490,38 +507,42 @@ def read_edgelist(f, directed=True, sep="\t", header=None, **readcsvkwargs):
     allnodes = list(
         set(elist.src.unique())
         .union(set(elist.dst.unique())))
-    # This factors all the unique nodes to unique IDs
+    # Factor all nodes to unique IDs
     names = (
-        np.array(
         pd.Series(allnodes).astype('category')
         .cat.categories
-    ))
-    name_dict = dict(zip(names,
-                         np.arange(names.shape[0])))
-    src = np.array(elist.src.map(name_dict), dtype=np.uint32)
-    dst = np.array(elist.dst.map(name_dict), dtype=np.uint32)
+    )
     nnodes = names.shape[0]
-    #
-    # TODO: test weighed input graphs here more!!!
-    # test int weights, float weights, etc.
-    #
-    if 'weight' in elist.columns:
-        weights = elist.weight.to_numpy()
+    # Get the input data type
+    if nnodes > UINT32_MAX:
+        dtype = np.uint64
     else:
-        weights = np.ones(dst.shape[0])
+        dtype = np.uint32
+    name_dict = dict(zip(names,
+                         np.arange(names.shape[0], dtype=dtype)))
+    elist.src = elist.src.map(name_dict)
+    elist.dst = elist.dst.map(name_dict)
     # clean up temp data
-    elist = None
     allnodes = None
     name_dict = None
     gc.collect()
     # If undirected graph, append edgelist to reversed self
     if not directed:
-        src = np.concatenate([src, dst])
-        # since we overwrote src, we pick original one from dst's shape
-        dst = np.concatenate([dst, src[:-dst.shape[0]]])
-        weights = np.concatenate([weights, weights])
+        other_df = elist.copy()
+        other_df.columns = ['dst', 'src', 'weight']
+        elist = pd.concat([elist, other_df])
+        other_df = None
         gc.collect()
+    # Need to sort by src for _edgelist_to_graph
+    elist = elist.sort_values(by='src')
+    # extract numpy arrays and clear memory
+    src = elist.src.to_numpy()
+    dst = elist.dst.to_numpy()
+    weight = elist.weight.to_numpy()
+    elist = None
+    gc.collect()
     G = methods._edgelist_to_graph(
-        src, dst, weights, nnodes, nodenames=names
+        src, dst, weight,
+        nnodes, nodenames=names
     )
     return G
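
This hunk is the bug fix named in the commit message. The old path cast node IDs to uint32 unconditionally and, for undirected graphs, concatenated the reverse edges onto `src`/`dst` without re-sorting, so the arrays handed to `methods._edgelist_to_graph` were no longer grouped by source node. The new code picks uint64 when the node count exceeds UINT32_MAX, mirrors the DataFrame to get the reverse edges, and sorts by `src`, which the new comment notes `_edgelist_to_graph` requires. A toy illustration of why CSR construction needs `src`-sorted edges (`edgelist_to_csr` is a made-up stand-in, not the library's `_edgelist_to_graph`):

```python
import numpy as np

def edgelist_to_csr(src, dst, weight, nnodes):
    # CSR layout assumes each node's edges are contiguous,
    # so sort the edges by source node before building indptr.
    order = np.argsort(src)              # the fix, in miniature
    src, dst, weight = src[order], dst[order], weight[order]
    indptr = np.zeros(nnodes + 1, dtype=np.int64)
    np.cumsum(np.bincount(src, minlength=nnodes), out=indptr[1:])
    return indptr, dst, weight

# undirected edges (0,1) and (0,2) plus their reverses, in file order
src = np.array([1, 0, 2, 0])
dst = np.array([0, 1, 0, 2])
w = np.ones(4)
indptr, indices, data = edgelist_to_csr(src, dst, w, nnodes=3)
print(indptr)   # [0 2 3 4]: node 0's neighbors live at indices[0:2]
```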

tests/test_graph.py

Lines changed: 55 additions & 0 deletions
@@ -1,9 +1,11 @@
+import io
 import networkx as nx
 import numpy as np
 import pandas as pd
 import random
 from scipy import sparse
 from sklearn import cluster, manifold, metrics
+import string
 import unittest
 import warnings
 
@@ -230,6 +232,59 @@ def test_karate(self):
         # Only those edges are present
         self.assertTrue(m.sum() == 154)
 
+    def test_string_karate(self):
+        N_NODES = 35
+        STR_LEN = 10
+        fname = "./data/karate_edges.txt"
+        df = pd.read_csv(fname, sep="\t", header=None)
+        # string node names for each node ID
+        new_names = [
+            ''.join(random.choice(string.ascii_uppercase)
+                    for _ in range(STR_LEN))
+            for i in range(N_NODES)
+        ]
+        # Map node ID -> new node name
+        name_dict = dict(zip(np.arange(N_NODES), new_names))
+        for c in df.columns:
+            df[c] = df[c].map(name_dict)
+        # Pass this new data to read_edgelist
+        data = io.StringIO(df.to_csv(index=False, header=False))
+        G = cg.read_edgelist(data, sep=',')
+        # re-read original graph
+        df2 = pd.read_csv(fname, sep="\t", header=None)
+        # re-map IDs to string node names
+        for c in df2.columns:
+            df2[c] = df2[c].map(name_dict)
+        df2.columns = ['src', 'dst']
+        for i in range(len(df2)):
+            s = df2.iloc[i].src
+            d = df2.iloc[i].dst
+            # addressing graph by __getitem__ with str
+            # should return list of str node names
+            self.assertTrue(d in G[s])
+        # Only those edges are present
+        m = G.mat.todense()
+        self.assertTrue(m.sum() == 154)
+
+    def test_float_weights_reading(self):
+        fname = "./data/karate_edges.txt"
+        df = pd.read_csv(fname, sep="\t", header=None)
+        df['weights'] = np.random.rand(df.shape[0])
+        data = io.StringIO(df.to_csv(index=False, header=False))
+        G = cg.read_edgelist(data, sep=',')
+        self.assertTrue((G.weights < 1).all())
+        self.assertTrue((G.weights > 0).all())
+
+    def test_int_weights_reading(self):
+        WEIGHT_VALUE = 5
+        fname = "./data/karate_edges.txt"
+        df = pd.read_csv(fname, sep="\t", header=None)
+        df['weights'] = np.ones(df.shape[0]) * WEIGHT_VALUE
+        data = io.StringIO(df.to_csv(index=False, header=False))
+        G = cg.read_edgelist(data, sep=',')
+        self.assertTrue((G.weights == WEIGHT_VALUE).all())
+        self.assertTrue((G.weights == WEIGHT_VALUE).all())
+
     def test_largenumbererror(self):
         fname = "./data/largenumbererror.csv"
         G = cg.read_edgelist(fname, sep=',')
