dm-graphs/dm_graphs.py at master · evanmiltenburg/dm-graphs · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
import networkx as nx
import community
from itertools import combinations

################################################################################
#   Utilities

def invert_weights(G):
    """Invert weights. This is useful when weights are Cosine DISTANCE rather than
    Cosine SIMILARITY. Be sure to check whether the weights positively correspond
    to connection strength, or else Gephi will generate awkward pictures."""
    NG = nx.Graph()
    NG.add_weighted_edges_from((a,b,1-G[a][b]['weight']) for a,b in G.edges())
    return NG

def remove_weights(G):
    """Remove weights. This is useful if you want to write out the graph without
    any weighting information. For example, if you don't want Gephi to make edges
    thicker."""
    NG = nx.Graph()
    NG.add_edges_from(G.edges())
    return NG

def add_partition_data(G,partition,name="Part"):
    """Add partition data to the graph. The partition should come in the form of
    a dictionary, with k=node, v=part. Optionally, you can specify a name for the
    partitioning."""
    NG = G.copy()
    for node in NG.nodes():
        NG.node[node]['Part'] = partition[node]
    return NG

def rank_dict(G,node):
    "Returns a dictionary of words connected to the node with their ranking."
    ranked          = dict()
    neighbors       = G.neighbors(node)
    num_neighbors   = float(len(neighbors))
    for rank,word in enumerate( sorted(neighbors,
                                key=lambda x:G[node][x]['weight'])):
        ranked[word] = (rank+1)/num_neighbors
    return ranked

def rank_reweight(G):
    "Reweight the graph by rank rather than similarity."
    NG  = nx.Graph()
    d   = {node:rank_dict(G,node) for node in G.nodes()}
    def avg_rank(a,b,d):
        return (d[a][b] + d[b][a])/2
    NG.add_weighted_edges_from((a,b,avg_rank(a,b,d)) for a,b in G.edges())
    return NG

def write_functions():
    "See what functions you can use to write out the graph."
    print [f for f in dir(nx) if f.startswith('write')]

################################################################################
#   Analyzing the graph

def main_graph(G):
    "Takes a graph object and returns the largest graph."
    graph_list = list(nx.connected_component_subgraphs(G))
    return sorted(graph_list,   # Main graph is equal to the first element
                  reverse=True, # of the list of isolated graphs, sorted in
                  key=len       # reverse order.
                        )[0]

def graph_analysis(G):
    """Analyze graph. Returns a dictionary with useful data.
    Cannot deal with weights below 0, so all negative weights are set to 0."""
    MG                      = main_graph(G)
    for a,b in MG.edges():
        w = MG[a][b]['weight']
        if w < 0:
            MG[a][b]['weight'] = 0
    partition = community.best_partition(MG)
    return { 'num_clusters':    max(partition.values()),
             'modularity':      community.modularity(partition,MG),
             'size':            len(MG.nodes()),
             'partition':       partition}

def partition_by_group(partition):
    "Get a dictionary with k=partition, v=[words,in,partition]"
    d = {i:[] for i in range(max(partition.values())+1)}
    for w in partition:
        d[partition[w]].append(w)
    return d

################################################################################
#   Reducing the graph

def graph_reduce(G,n=5,theta=0.5,
                    top_n=True,
                    mutual=False,
                    threshold=False,
                    return_only_main=True):
    """Reduce the graph through either (or both) of two similarity measures:
        - An edge can only connect A and B iff B is in A's top-n similar nodes,
        or vice versa.
        - An edge can only connect A and B iff the weight is sufficient."""
    if mutual:
        NG        = nx.Graph()            # generate a new graph
        topn_dict = dict()                # and a dict to keep top-n neighbors
        for node in G:                    # for each node create a sorted list of
            l = sorted( G.neighbors(node),# similar nodes.
                    key=lambda x:G[node][x]['weight'],
                    reverse=True)[:n]     # get first n items
            topn_dict[node] = set(l)      # and add it to the dictionary
        # select the edges that match the criteria.
        to_add = [(a,b) for a,b in G.edges() if     a in topn_dict[b]
                                                and b in topn_dict[a]]
        # and add them to the new graph.
        NG.add_weighted_edges_from((a,b,G[a][b]['weight']) for a,b in to_add)

    # Proceed with the top-n condition, if mutual is False.

    elif top_n:                           # if we are using the top-n measure
        NG = nx.Graph()                   # generate a new graph
        for node in G:                    # and for each node a sorted list of
            l = sorted( G.neighbors(node),# similar nodes.
                    key=lambda x:G[node][x]['weight'],
                    reverse=True)[:n]     # first n items

            # And add the top-n similar edges to the graph!
            NG.add_weighted_edges_from((node,x,G[node][x]['weight']) for x in l)

    # Else just create a copy of G, that we can prune using the threshold measure
    else:
        NG = G.copy()

    # Code for the threshold measure:
    if threshold:                         # if threshold is active, and weights
        for a,b in NG.edges():            # for all connected nodes a,b
            if G[a][b]['weight'] < theta: # if the similarity is too small.
                NG.remove_edge(a,b)       # remove the edge between a and b

    # By default we only return the main graph and ignore isolated graphs:
    if return_only_main:
        return main_graph(NG)
    else:
        return NG


def MST_pathfinder(G):
    """The MST-pathfinder algorithm (Quirin et al. 2008) reduces the graph to the
    unions of all minimal spanning trees."""
    NG    = nx.Graph()
    edges = sorted( ((G[a][b]['weight'],a,b) for a,b in G.edges()),
                        reverse=True) # reversed, because greater similarity = better
    clusters = {node:i for i,node in enumerate(G.nodes())}
    while not edges == []:
        w1,a,b = edges[0]
        l      = []
        # Select edges to be considered this round:
        for w2,u,v in edges:
            if w1 == w2:
                l.append((u,v,w2))
            else:
                break
        # Remaining edges are those not being considered this round:
        edges = edges[len(l):]
        # Only select those edges for which the items are not in the same cluster
        l = [(a,b,c) for a,b,c in l if not clusters[a]==clusters[b]]
        # Add these edges to the graph:
        NG.add_weighted_edges_from(l)
        # Merge the clusters:
        for a,b,w in l:
            cluster_1 = clusters[a]
            cluster_2 = clusters[b]
            clusters = {node:cluster_1 if i==cluster_2 else i
                        for node,i in clusters.iteritems()}
    return NG


def maxmax_transform(G):
    """Implements phase 1 of the MaxMax algorithm by Hope & Keller (2013).
    Returns a directed graph where for all edges (v,u) v is the maximal vertex of u."""

    index = dict()
    NG    = nx.DiGraph()

    def update_edges(item,neighbor,weight):
        "Update the edge index for item."
        l = index.get(item,[])
        if len(l) == 0:
            index[item] = [(neighbor,weight)]
        else:
            current_weight = index[item][0][1]
            if weight > current_weight:
                index[item] = [(neighbor,weight)]
            elif weight == current_weight:
                index[item].append((neighbor,weight))
            else:
                pass # do nothing

    # Go through all edges, and establish maximal vertices for each vertex.
    for u,v in G.edges():
        w = G[u][v]['weight']
        update_edges(u,v,w)
        update_edges(v,u,w)

    # Add edges to the new directed graph.
    NG.add_edges_from((t[0],u) for u,l in index.items() for t in l)

    return NG

def maxmax_clusters(G):
    """Get clusters based on directed MaxMax graph.

    My own twist to the MaxMax algorithm, but still in the MaxMax spirit.

    Hope & Keller do this:
    for node in G.nodes():
        if node is marked ROOT:
            mark children of node NOT-ROOT

    But this means that with this structure, if we start with B and all nodes are
    marked ROOT, then D is NOT-ROOT. This means that F and G will not be marked
    NOT-ROOT:

                 A
               B   C
             D
           F   G

    I just return a list containing all maximal QSC subgraphs."""

    # Function to create clusters. It takes a node and returns a set containing
    # that node and all its successors.
    cluster    = lambda node: {node} | set(nx.bfs_successors(G,node))

    # Get and sort all clusters by size:
    clusters   = sorted([cluster(node) for node in G.nodes()],key=len,reverse=True)

    # Create list to output results:
    results    = []

    # Loop through the ordered set of clusters, starting with the largest.
    # Add cluster to the results iff it is not a subset of a cluster already in
    # the results.
    for c in clusters:
        if not(True in [c.issubset(s_i) for s_i in results]):
            results.append(c)
    return results