Commit cad412d

fixed evil read_edgelist bug

1 parent 0ea9160 commit cad412d

3 files changed: +116, -41 lines

TODO.txt

Lines changed: 2 additions & 3 deletions
@@ -1,5 +1,7 @@
 TODO:
 
+Add GGVec negative_lr_ratio parameter
+
 Harden n2v walks to dead end nodes like in regular random walks
 
 self.names EVERYWHERE!
@@ -27,7 +29,6 @@ Run grid searches, write paper
 
 KarateClub graphs tend to perform poorly for 1st order methods
 
-
 ### This goes along with GraRep recommendations
 
 in BioNEV : "A large value for link prediction tasks (e.g. 3, 4);a small value for node classification tasks (e.g.1, 2)" (p.9)
@@ -54,8 +55,6 @@ Embed Twitter
 
 -------------
 
-Finish Up node names bullshit
-
 mmap support
 read_edgelist --> mmaped src/dst/weights
 Should be a different class (mmap_graph?)

csrgraph/graph.py

Lines changed: 59 additions & 38 deletions
@@ -21,7 +21,7 @@
 from csrgraph import methods, random_walks
 from csrgraph import ggvec, glove, grarep
 
-UINT32_MAX = (2**32)-1
+UINT32_MAX = (2**32) - 1
 
 class csrgraph():
     """
@@ -105,7 +105,7 @@ def __init__(self, data, nodenames=None, copy=True, threads=0):
         if nodenames is not None:
             self.names = pd.Series(nodenames)
         else:
-            self.names = None
+            self.names = pd.Series(np.arange(self.nnodes))
         # Bounds check once here otherwise there be dragons later
         max_idx = np.max(self.dst)
         if self.nnodes < max_idx:
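
With this change a graph always carries a `names` Series, defaulting to integer IDs when no `nodenames` are passed, so downstream code no longer needs a `None` branch. A quick sketch of the resulting behavior (assuming the constructor accepts a SciPy CSR matrix; `cg` here is `import csrgraph as cg`):

```python
import numpy as np
from scipy import sparse
import csrgraph as cg

# A 3-node graph built without node names.
# Before this commit G.names was None; now it defaults to integer IDs.
G = cg.csrgraph(sparse.eye(3, format='csr'))
print(G.names.values)   # [0 1 2]
print(G[0])             # neighbor names of node 0 -- no None branch needed
```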
@@ -137,20 +137,20 @@ def set_threads(self, threads):
         _src_multiply.recompile()
         _dst_multiply.recompile()
 
-
     def __getitem__(self, node):
         """
-        Bracket operator
-
-        Gets names of neighbor nodes
+        [] operator
+        like networkX, gets names of neighbor nodes
         """
-        if self.names is not None:
-            node_id = self.names[self.names == node].index[0]
-        else:
-            node_id = node
-        edges = self.dst[self.src[node_id]:
-                         self.src[node_id+1]]
-        return self.names[edges].values
+        # Get node ID from names array
+        # This is O(n) by design -- we more often get names from IDs
+        # than we get IDs from names and we don't want to hold 2 maps
+        # TODO : replace names with a pd.Index and use get_loc
+        node_id = self.names[self.names == node].index[0]
+        edges = self.dst[
+            self.src[node_id] : self.src[node_id+1]
+        ]
+        return self.names.iloc[edges].values
 
     def nodes(self):
         """
@@ -173,9 +173,12 @@ def normalize(self, return_self=True):
         """
         new_weights = _row_norm(self.weights, self.src)
         if return_self:
-            self.weights = new_weights
-            if hasattr(self, 'mat'):
-                self.mat=sparse.csr_matrix((self.weights, self.dst, self.src))
+            self.mat = sparse.csr_matrix((new_weights, self.dst, self.src))
+            # Point objects to the correct places
+            self.weights = self.mat.data
+            self.src = self.mat.indptr
+            self.dst = self.mat.indices
+            gc.collect()
             return self
         else:
             return csrgraph(sparse.csr_matrix(
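
`normalize` now rebuilds the csr_matrix first, then points `weights`/`src`/`dst` at `mat.data`/`mat.indptr`/`mat.indices`, so the attributes and the matrix share one set of buffers instead of drifting apart. A small sketch of the property being relied on (SciPy generally wraps the passed arrays without copying when dtypes are compatible; an observation about current SciPy behavior, not a documented guarantee):

```python
import numpy as np
from scipy import sparse

weights = np.array([0.5, 0.5, 1.0])
dst = np.array([1, 2, 0], dtype=np.int32)
src = np.array([0, 2, 3, 3], dtype=np.int32)
mat = sparse.csr_matrix((weights, dst, src), shape=(3, 3))

# With compatible dtypes the matrix reuses the input buffers,
# so attributes pointed at mat.data / mat.indices / mat.indptr
# stay in sync with the matrix itself.
print(np.shares_memory(mat.data, weights))   # True
```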
@@ -458,12 +461,25 @@ def random_walk_resample(self, walklen=4, epochs=30):
 #
 #
 
-def read_edgelist(f, directed=True, sep="\t", header=None, **readcsvkwargs):
+def read_edgelist(f, directed=True, sep=r"\s+", header=None, **readcsvkwargs):
     """
-    Creates a csrgraph from an edgelist
+    Creates a csrgraph from an edgelist.
+
+    The edgelist should be in the form
+        [source destination]
+    or
+        [source destination edge_weight]
+
+    The first column needs to be the source, the second the destination.
+    If there is a third column it's assumed to be edge weights.
+
+    Otherwise, all arguments from pandas.read_csv can be used to read the file.
 
     f : str
         Filename to read
+    directed : bool
+        Whether the graph is directed or undirected.
+        All csrgraphs are directed, undirected graphs simply add "return edges"
     sep : str
         CSV-style separator. Eg. Use "," if comma separated
     header : int or None
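
Given the new docstring and the `\s+` default separator, a usage sketch (the file name and its contents are invented for illustration):

```python
import csrgraph as cg

# edges.txt -- whitespace separated, optional third weight column:
#   a  b  1.5
#   b  c  0.5
G = cg.read_edgelist("edges.txt", directed=False, sep=r"\s+")
print(G.names.values)   # node names factored to IDs: ['a' 'b' 'c']
print(G['a'])           # neighbor names of 'a' via __getitem__
```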
@@ -476,6 +492,7 @@ def read_edgelist(f, directed=True, sep="\t", header=None, **readcsvkwargs):
     elist = pd.read_csv(f, sep=sep, header=header, **readcsvkwargs)
     if len(elist.columns) == 2:
         elist.columns = ['src', 'dst']
+        elist['weight'] = np.ones(elist.shape[0])
    elif len(elist.columns) == 3:
         elist.columns = ['src', 'dst', 'weight']
     else:
@@ -490,38 +507,42 @@ def read_edgelist(f, directed=True, sep="\t", header=None, **readcsvkwargs):
     allnodes = list(
         set(elist.src.unique())
         .union(set(elist.dst.unique())))
-    # This factors all the unique nodes to unique IDs
+    # Factor all nodes to unique IDs
     names = (
-        np.array(
         pd.Series(allnodes).astype('category')
         .cat.categories
-    ))
-    name_dict = dict(zip(names,
-                         np.arange(names.shape[0])))
-    src = np.array(elist.src.map(name_dict), dtype=np.uint32)
-    dst = np.array(elist.dst.map(name_dict), dtype=np.uint32)
+    )
     nnodes = names.shape[0]
-    #
-    # TODO: test weighed input graphs here more!!!
-    # test int weights, float weights, etc.
-    #
-    if 'weight' in elist.columns:
-        weights = elist.weight.to_numpy()
+    # Get the input data type
+    if nnodes > UINT32_MAX:
+        dtype = np.uint64
     else:
-        weights = np.ones(dst.shape[0])
+        dtype = np.uint32
+    name_dict = dict(zip(names,
+                         np.arange(names.shape[0], dtype=dtype)))
+    elist.src = elist.src.map(name_dict)
+    elist.dst = elist.dst.map(name_dict)
     # clean up temp data
-    elist = None
     allnodes = None
     name_dict = None
     gc.collect()
     # If undirected graph, append edgelist to reversed self
     if not directed:
-        src = np.concatenate([src, dst])
-        # since we overwrote src, we pick original one from dst's shape
-        dst = np.concatenate([dst, src[:-dst.shape[0]]])
-        weights = np.concatenate([weights, weights])
+        other_df = elist.copy()
+        other_df.columns = ['dst', 'src', 'weight']
+        elist = pd.concat([elist, other_df])
+        other_df = None
         gc.collect()
+    # Need to sort by src for _edgelist_to_graph
+    elist = elist.sort_values(by='src')
+    # extract numpy arrays and clear memory
+    src = elist.src.to_numpy()
+    dst = elist.dst.to_numpy()
+    weight = elist.weight.to_numpy()
+    elist = None
+    gc.collect()
     G = methods._edgelist_to_graph(
-        src, dst, weights, nnodes, nodenames=names
+        src, dst, weight,
+        nnodes, nodenames=names
     )
     return G
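
This hunk is the bug fix named in the commit message. The old path cast node IDs to uint32 unconditionally and, for undirected graphs, concatenated the reverse edges onto `src`/`dst` without re-sorting, so the arrays handed to `methods._edgelist_to_graph` were no longer grouped by source node. The new code picks uint64 when the node count exceeds UINT32_MAX, mirrors the DataFrame to get the reverse edges, and sorts by `src`, which the new comment notes `_edgelist_to_graph` requires. A toy illustration of why CSR construction needs `src`-sorted edges (`edgelist_to_csr` is a made-up stand-in, not the library's `_edgelist_to_graph`):

```python
import numpy as np

def edgelist_to_csr(src, dst, weight, nnodes):
    # CSR layout assumes each node's edges are contiguous,
    # so sort the edges by source node before building indptr.
    order = np.argsort(src)              # the fix, in miniature
    src, dst, weight = src[order], dst[order], weight[order]
    indptr = np.zeros(nnodes + 1, dtype=np.int64)
    np.cumsum(np.bincount(src, minlength=nnodes), out=indptr[1:])
    return indptr, dst, weight

# undirected edges (0,1) and (0,2) plus their reverses, in file order
src = np.array([1, 0, 2, 0])
dst = np.array([0, 1, 0, 2])
w = np.ones(4)
indptr, indices, data = edgelist_to_csr(src, dst, w, nnodes=3)
print(indptr)   # [0 2 3 4]: node 0's neighbors live at indices[0:2]
```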

tests/test_graph.py

Lines changed: 55 additions & 0 deletions
@@ -1,9 +1,11 @@
+import io
 import networkx as nx
 import numpy as np
 import pandas as pd
 import random
 from scipy import sparse
 from sklearn import cluster, manifold, metrics
+import string
 import unittest
 import warnings
 
@@ -230,6 +232,59 @@ def test_karate(self):
         # Only those edges are present
         self.assertTrue(m.sum() == 154)
 
+    def test_string_karate(self):
+        N_NODES = 35
+        STR_LEN = 10
+        fname = "./data/karate_edges.txt"
+        df = pd.read_csv(fname, sep="\t", header=None)
+        # string node names for each node ID
+        new_names = [
+            ''.join(random.choice(string.ascii_uppercase)
+                    for _ in range(STR_LEN))
+            for i in range(N_NODES)
+        ]
+        # Map node ID -> new node name
+        name_dict = dict(zip(np.arange(N_NODES), new_names))
+        for c in df.columns:
+            df[c] = df[c].map(name_dict)
+        # Pass this new data to read_edgelist
+        data = io.StringIO(df.to_csv(index=False, header=False))
+        G = cg.read_edgelist(data, sep=',')
+        # re-read original graph
+        df2 = pd.read_csv(fname, sep="\t", header=None)
+        # re-map IDs to string node names
+        for c in df2.columns:
+            df2[c] = df2[c].map(name_dict)
+        df2.columns = ['src', 'dst']
+        for i in range(len(df2)):
+            s = df2.iloc[i].src
+            d = df2.iloc[i].dst
+            # addressing graph by __getitem__ with str
+            # should return list of str node names
+            self.assertTrue(d in G[s])
+        # Only those edges are present
+        m = G.mat.todense()
+        self.assertTrue(m.sum() == 154)
+
+    def test_float_weights_reading(self):
+        fname = "./data/karate_edges.txt"
+        df = pd.read_csv(fname, sep="\t", header=None)
+        df['weights'] = np.random.rand(df.shape[0])
+        data = io.StringIO(df.to_csv(index=False, header=False))
+        G = cg.read_edgelist(data, sep=',')
+        self.assertTrue((G.weights < 1).all())
+        self.assertTrue((G.weights > 0).all())
+
+    def test_int_weights_reading(self):
+        WEIGHT_VALUE = 5
+        fname = "./data/karate_edges.txt"
+        df = pd.read_csv(fname, sep="\t", header=None)
+        df['weights'] = np.ones(df.shape[0]) * WEIGHT_VALUE
+        data = io.StringIO(df.to_csv(index=False, header=False))
+        G = cg.read_edgelist(data, sep=',')
+        self.assertTrue((G.weights == WEIGHT_VALUE).all())
+        self.assertTrue((G.weights == WEIGHT_VALUE).all())
+
     def test_largenumbererror(self):
         fname = "./data/largenumbererror.csv"
         G = cg.read_edgelist(fname, sep=',')
