2121from csrgraph import methods , random_walks
2222from csrgraph import ggvec , glove , grarep
2323
# Largest value representable in an unsigned 32-bit integer; node counts
# above this need a wider (64-bit) index dtype.
UINT32_MAX = (2 ** 32) - 1
2525
2626class csrgraph ():
2727 """
@@ -105,7 +105,7 @@ def __init__(self, data, nodenames=None, copy=True, threads=0):
105105 if nodenames is not None :
106106 self .names = pd .Series (nodenames )
107107 else :
108- self .names = None
108+ self .names = pd . Series ( np . arange ( self . nnodes ))
109109 # Bounds check once here otherwise there be dragons later
110110 max_idx = np .max (self .dst )
111111 if self .nnodes < max_idx :
@@ -137,20 +137,20 @@ def set_threads(self, threads):
137137 _src_multiply .recompile ()
138138 _dst_multiply .recompile ()
139139
140-
def __getitem__(self, node):
    """
    [] operator
    like networkX, gets names of neighbor nodes

    node : node name
        Name of the node as stored in `self.names`.
        If `self.names` is None, `node` is used directly as the
        integer node ID.

    Returns the names of the neighbors of `node` as a numpy array
    (or the neighbor IDs themselves when `self.names` is None).

    Raises IndexError if `node` is not present in `self.names`.
    """
    if self.names is None:
        # No name mapping: the node is its own ID and so are its neighbors
        node_id = node
        return self.dst[self.src[node_id]:self.src[node_id + 1]]
    # Get node ID from names array
    # This is O(n) by design -- we more often get names from IDs
    # than we get IDs from names and we don't want to hold 2 maps
    # TODO : replace names with a pd.Index and use get_loc
    #
    # The node ID is the *position* of the name in the series, not its
    # index label: `pd.Series(nodenames)` keeps a custom index when
    # nodenames is itself a Series, so `.index[0]` can be wrong there.
    matches = np.flatnonzero((self.names == node).to_numpy())
    node_id = matches[0]
    # CSR layout: the neighbors of node_id are stored in
    # dst[src[node_id] : src[node_id + 1]]
    edges = self.dst[self.src[node_id]:self.src[node_id + 1]]
    # Positional lookup so neighbor IDs map to names whatever the index is
    return self.names.iloc[edges].values
154154
155155 def nodes (self ):
156156 """
@@ -173,9 +173,12 @@ def normalize(self, return_self=True):
173173 """
174174 new_weights = _row_norm (self .weights , self .src )
175175 if return_self :
176- self .weights = new_weights
177- if hasattr (self , 'mat' ):
178- self .mat = sparse .csr_matrix ((self .weights , self .dst , self .src ))
176+ self .mat = sparse .csr_matrix ((new_weights , self .dst , self .src ))
177+ # Point objects to the correct places
178+ self .weights = self .mat .data
179+ self .src = self .mat .indptr
180+ self .dst = self .mat .indices
181+ gc .collect ()
179182 return self
180183 else :
181184 return csrgraph (sparse .csr_matrix (
@@ -458,12 +461,25 @@ def random_walk_resample(self, walklen=4, epochs=30):
458461 #
459462 #
460463
461- def read_edgelist (f , directed = True , sep = " \t " , header = None , ** readcsvkwargs ):
464+ def read_edgelist (f , directed = True , sep = r"\s+ " , header = None , ** readcsvkwargs ):
462465 """
463- Creates a csrgraph from an edgelist
466+ Creates a csrgraph from an edgelist.
467+
468+ The edgelist should be in the form
469+ [source destination]
470+ or
471+ [source destination edge_weight]
472+
473+ The first column needs to be the source, the second the destination.
474+ If there is a third column it's assumed to be edge weights.
475+
476+ Otherwise, all arguments from pandas.read_csv can be used to read the file.
464477
465478 f : str
466479 Filename to read
480+ directed : bool
481+ Whether the graph is directed or undirected.
482+ All csrgraphs are directed, undirected graphs simply add "return edges"
467483 sep : str
468484 CSV-style separator. Eg. Use "," if comma separated
469485 header : int or None
@@ -476,6 +492,7 @@ def read_edgelist(f, directed=True, sep="\t", header=None, **readcsvkwargs):
476492 elist = pd .read_csv (f , sep = sep , header = header , ** readcsvkwargs )
477493 if len (elist .columns ) == 2 :
478494 elist .columns = ['src' , 'dst' ]
495+ elist ['weight' ] = np .ones (elist .shape [0 ])
479496 elif len (elist .columns ) == 3 :
480497 elist .columns = ['src' , 'dst' , 'weight' ]
481498 else :
@@ -490,38 +507,42 @@ def read_edgelist(f, directed=True, sep="\t", header=None, **readcsvkwargs):
490507 allnodes = list (
491508 set (elist .src .unique ())
492509 .union (set (elist .dst .unique ())))
493- # This factors all the unique nodes to unique IDs
510+ # Factor all nodes to unique IDs
494511 names = (
495- np .array (
496512 pd .Series (allnodes ).astype ('category' )
497513 .cat .categories
498- ))
499- name_dict = dict (zip (names ,
500- np .arange (names .shape [0 ])))
501- src = np .array (elist .src .map (name_dict ), dtype = np .uint32 )
502- dst = np .array (elist .dst .map (name_dict ), dtype = np .uint32 )
514+ )
503515 nnodes = names .shape [0 ]
504- #
505- # TODO: test weighed input graphs here more!!!
506- # test int weights, float weights, etc.
507- #
508- if 'weight' in elist .columns :
509- weights = elist .weight .to_numpy ()
516+ # Get the input data type
517+ if nnodes > UINT32_MAX :
518+ dtype = np .uint64
510519 else :
511- weights = np .ones (dst .shape [0 ])
520+ dtype = np .uint32
521+ name_dict = dict (zip (names ,
522+ np .arange (names .shape [0 ], dtype = dtype )))
523+ elist .src = elist .src .map (name_dict )
524+ elist .dst = elist .dst .map (name_dict )
512525 # clean up temp data
513- elist = None
514526 allnodes = None
515527 name_dict = None
516528 gc .collect ()
517529 # If undirected graph, append edgelist to reversed self
518530 if not directed :
519- src = np . concatenate ([ src , dst ] )
520- # since we overwrote src, we pick original one from dst's shape
521- dst = np . concatenate ([ dst , src [: - dst . shape [ 0 ]] ])
522- weights = np . concatenate ([ weights , weights ])
531+ other_df = elist . copy ( )
532+ other_df . columns = [ 'dst' , ' src' , 'weight' ]
533+ elist = pd . concat ([ elist , other_df ])
534+ other_df = None
523535 gc .collect ()
536+ # Need to sort by src for _edgelist_to_graph
537+ elist = elist .sort_values (by = 'src' )
538+ # extract numpy arrays and clear memory
539+ src = elist .src .to_numpy ()
540+ dst = elist .dst .to_numpy ()
541+ weight = elist .weight .to_numpy ()
542+ elist = None
543+ gc .collect ()
524544 G = methods ._edgelist_to_graph (
525- src , dst , weights , nnodes , nodenames = names
545+ src , dst , weight ,
546+ nnodes , nodenames = names
526547 )
527548 return G
0 commit comments