Skip to content

Commit e397193

Browse files
committed
Adding image and an attempt at bash cleaning data.
1 parent e506ed9 commit e397193

File tree

3 files changed

+132
-114
lines changed

3 files changed

+132
-114
lines changed

img/interop_figure.png

975 KB
Loading

src/networkgraph.py

Lines changed: 115 additions & 114 deletions
Original file line numberDiff line numberDiff line change
@@ -1,124 +1,125 @@
1-
import networkx as nx
21
import csv
32
from itertools import combinations
43
import matplotlib.pyplot as plt
4+
import networkx as nx
5+
from networkx.algorithms import bipartite
6+
from sknetwork.data import from_edge_list
7+
from sknetwork.clustering import Louvain
8+
from nxviz import CircosPlot, BasePlot
9+
import nxviz as nv
510
import math
11+
import json
612

713
G = nx.Graph()
814

9-
# Legacy pipeline: build a weighted graph of data resources directly from the
# CSV exports. Two resources are linked when the same DOI appears in hits for
# both; the edge weight counts how many DOIs they share.
papers = []
dbs = []

# Every row of merged_records.csv is kept with its cells stripped.
with open('./data/merged_records.csv', 'r', encoding='UTF-8', newline='') as terms:
    for row in csv.reader(terms):
        dbs.append([cell.strip() for cell in row])

# Index the records by their first column once, rather than rebuilding the key
# list for every paper. setdefault keeps the first row for a repeated key
# (the same row list.index() would find); blank rows are ignored.
db_by_key = {}
for record in dbs:
    if record:
        db_by_key.setdefault(record[0], record)

# repohits.csv columns used here: 0 = DOI, 1 = snippet, 2 = title,
# 3 = resource key (matched against column 0 of merged_records.csv).
with open('./data/repohits.csv', 'r', encoding='UTF-8', newline='') as file:
    for row in csv.reader(file):
        papers.append({'doi': row[0],
                       'snippet': row[1],
                       'title': row[2],
                       'database': db_by_key[row[3]][1]})

# Remove all papers without a DOI:
papers = [paper for paper in papers if paper['doi']]
doi_list = [paper['doi'] for paper in papers]
doi_set = set(doi_list)

# Keep only DOIs that occur more than once; counted in a single pass instead
# of calling list.count() for every distinct DOI.
doi_count = {}
for doi in doi_list:
    doi_count[doi] = doi_count.get(doi, 0) + 1
doi_count = {doi: count for doi, count in doi_count.items() if count > 1}
clean_dois = set(doi_count)

clean_papers = [paper for paper in papers if paper['doi'] in clean_dois]

dataresources = {paper['database'] for paper in papers}

G.add_nodes_from(dataresources)

# Group the resource names by DOI. Each name is added to the set as a whole
# string: set(name) would split it into individual characters and create
# bogus single-letter "resources".
resources_by_doi = {}
for seen, paper in enumerate(clean_papers, start=1):
    resources_by_doi.setdefault(paper['doi'], set()).add(paper['database'])
    if seen % 1000 == 0:
        print(seen)  # progress indicator

dois = [{'doi': doi, 'resource': names} for doi, names in resources_by_doi.items()]

for entry in dois:
    # combinations() of a set never pairs an element with itself, and yields
    # nothing for a single resource, so no extra guards are needed. Sorting
    # only makes the iteration order reproducible.
    for source, target in combinations(sorted(entry['resource']), 2):
        if G.has_edge(source, target):
            G.edges[source, target]['weight'] += 1
        else:
            G.add_edge(source, target, weight=1)

# NOTE(review): weights is computed but never passed to nx.draw below --
# presumably it was meant as width=weights; confirm before wiring it in.
weights = [math.sqrt(G[u][v]['weight']) for u, v in G.edges()]

subax1 = plt.subplot(111)
nx.draw(G, nx.kamada_kawai_layout(G), with_labels=True, edge_color="tab:red", font_weight='bold')
15+
# Load the paper records. Each record is read for a 'doi' value and a
# 'resources' iterable of resource names.
with open('./data/doi_joined.json', encoding='UTF-8') as terms:
    graph = json.load(terms)

# Drop records whose DOI is missing, None or empty.
graph = [record for record in graph if record.get('doi')]

dois = [record['doi'] for record in graph]

resources = set()
for record in graph:
    resources.update(record['resources'])

# Papers form one node set (bipartite=0), data resources the other (bipartite=1).
G.add_nodes_from(dois, bipartite=0)
G.add_nodes_from(resources, bipartite=1)
G.add_edges_from(
    (record['doi'], resource)
    for record in graph
    for resource in record['resources']
)

# We have a bipartite graph.
# A bare expression is silently discarded when run as a script, so report it.
print('Graph is connected:', nx.is_connected(G))

# Hand the edges to scikit-network with a constant weight of 1.
edge_list = [(u, v, 1) for u, v in G.edges()]
bgraph = from_edge_list(edge_list, bipartite=True)

names = bgraph.names
names_row = bgraph.names_row
names_col = bgraph.names_col
biadjacency = bgraph.biadjacency

# Louvain with Barber modularity
louvain = Louvain()
louvain.fit(biadjacency, force_bipartite=True)
labels_row = louvain.labels_row_
labels_col = louvain.labels_col_

# Add the label to the graph (column labels win if a name is on both sides).
partition = dict(zip(names_row, labels_row))
partition.update(zip(names_col, labels_col))

nx.set_node_attributes(G, partition, 'community_louvain')

# Use the public node view rather than the private G._node mapping.
resource_nodes = [node for node, side in G.nodes(data='bipartite') if side == 1]
paper_nodes = [node for node, side in G.nodes(data='bipartite') if side == 0]

# bipartite.degree_centrality returns scores for BOTH node sets. Keep the
# resource side by membership, not by guessing that every DOI starts with "1".
resource_set = set(resource_nodes)
resource_centrality = [
    (node, score)
    for node, score in nx.bipartite.degree_centrality(G, resource_nodes).items()
    if node in resource_set
]

# Five most central resources.
print(sorted(resource_centrality, key=lambda x: x[1], reverse=True)[:5])

# Project onto the resources: two resources are linked when they share a paper.
resource_graph = nx.bipartite.projection.projected_graph(G, resource_nodes)

for n in resource_graph:
    resource_graph.nodes[n]['neighbors_count'] = len(resource_graph[n])

options = {"edgecolors": "tab:gray", "node_size": 700, "alpha": 0.7}
label_options = {"ec": "k", "fc": "white", "alpha": 0.7}

pos = nx.spring_layout(resource_graph, seed=3113794652)  # positions for all nodes

fig = plt.figure(figsize=(6, 9))

nx.draw_networkx_edges(resource_graph, pos, alpha=0.1)
nx.draw_networkx_nodes(resource_graph, pos, **options)
nx.draw_networkx_labels(resource_graph, pos, font_size=14, bbox=label_options)
plt.show()
6784

68-
# Betweenness-centrality figure.
# The largest connected component is extracted into H, but every step below
# still operates on the full graph G.
components = nx.connected_components(G)
largest_component = max(components, key=len)
H = G.subgraph(largest_component)

# Node sizes come from weighted betweenness centrality (endpoints included).
centrality = nx.betweenness_centrality(G, endpoints=True, weight='weight')

# Node colours come from label-propagation communities: node -> community number.
lpc = nx.community.label_propagation_communities(G)
community_index = {}
for index, members in enumerate(lpc):
    for member in members:
        community_index[member] = index

#### draw graph ####
fig, ax = plt.subplots(figsize=(20, 15))
pos = nx.spring_layout(G, k=0.15, seed=4572321)
node_color = [community_index[node] for node in G]
node_size = [score * 20000 for score in centrality.values()]
draw_options = {
    "with_labels": False,
    "node_color": node_color,
    "node_size": node_size,
    "edge_color": "gainsboro",
    "alpha": 0.4,
}
nx.draw_networkx(G, pos=pos, **draw_options)

# Title in black; the legend lines below reuse the same font dict in red.
font = {"color": "k", "fontweight": "bold", "fontsize": 20}
ax.set_title("Gene functional association network (C. elegans)", font)
font["color"] = "r"

legend_lines = (
    (0.10, "node color = community structure"),
    (0.06, "node size = betweenness centrality"),
)
for y_position, caption in legend_lines:
    ax.text(
        0.80,
        y_position,
        caption,
        horizontalalignment="center",
        transform=ax.transAxes,
        fontdict=font,
    )

# Leave room around the drawing so the text stays readable.
ax.margins(0.1, 0.05)
fig.tight_layout()
plt.axis("off")
85+
86+
# function to create node colour list
def create_community_node_colors(graph, communities):
    """Return one colour per node of *graph*, chosen by community membership.

    Args:
        graph: Iterable of hashable nodes (for example a networkx graph).
        communities: Sequence of node collections. A node listed in several
            communities takes the colour of the first one that contains it.

    Returns:
        A list of hex colour strings in the iteration order of *graph*, always
        the same length as the number of nodes. The five-colour palette is
        reused cyclically when there are more than five communities, and a
        node that belongs to no community is coloured white, so the list stays
        aligned with the nodes.
    """
    palette = ["#D4FCB1", "#CDC5FC", "#FFC2C4", "#F2D140", "#BCC6C8"]
    unassigned_color = "#FFFFFF"

    # Build node -> community index once instead of scanning every community
    # for every node; setdefault keeps the first community that lists a node.
    community_of = {}
    for index, community in enumerate(communities):
        for member in community:
            community_of.setdefault(member, index)

    node_colors = []
    for node in graph:
        if node in community_of:
            # Modulo wraps around the palette rather than raising IndexError.
            node_colors.append(palette[community_of[node] % len(palette)])
        else:
            node_colors.append(unassigned_color)
    return node_colors
99+
100+
101+
# function to plot graph with node colouring based on communities
def visualize_communities(graph, communities, i):
    """Draw *graph* in slot *i* of a 3-row, 1-column subplot grid.

    Nodes are coloured by the community they belong to, and the subplot title
    reports the number of communities and the partition's modularity rounded
    to six decimals.

    Args:
        graph: The networkx graph to draw.
        communities: The partition of the graph's nodes to visualise.
        i: 1-based index of the subplot to draw into.
    """
    node_colors = create_community_node_colors(graph, communities)
    modularity = round(nx.community.modularity(graph, communities), 6)
    # Fixed seed so repeated runs place the nodes identically.
    layout = nx.spring_layout(graph, k=0.3, iterations=50, seed=2)

    plt.subplot(3, 1, i)
    plt.title(
        f"Community Visualization of {len(communities)} communities with modularity of {modularity}"
    )

    draw_options = {
        "pos": layout,
        "node_size": 1000,
        "node_color": node_colors,
        "with_labels": True,
        "font_size": 20,
        "font_color": "black",
    }
    nx.draw(graph, **draw_options)
118+
119+
120+
# Girvan-Newman yields one partition per level, and every further level costs
# another round of edge-betweenness computations. Only levels 0 and 3 are
# plotted, so stop after four levels instead of exhausting the generator.
# range() comes first in zip() so nothing beyond the fourth level is computed.
partition_levels = nx.algorithms.community.girvan_newman(resource_graph)
communities = [level for _, level in zip(range(4), partition_levels)]

# Plot graph with colouring based on communities
visualize_communities(resource_graph, communities[0], 1)
visualize_communities(resource_graph, communities[3], 2)
plt.show()

src/sed_cleaning.sh

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#!/bin/bash
#
# Delete, in place, every line of the CSV files below that matches one of the
# listed patterns. The files are looked up relative to the current directory.
#
# NOTE: `sed -i` with no backup suffix is GNU sed syntax; BSD/macOS sed needs
# `-i ''`. -E selects extended regular expressions so `|` means alternation.
# A pattern matches anywhere in the line, so no `^.*` / `.*$` padding is needed.

# Removing GBIF address lines:
sed -i '/Universitetsparken/d' GBIF.csv

# Removing BioTIME look-alike lines.
# [[:space:],.] is the portable "whitespace, comma or full stop" class; inside
# a bracket expression `\s` would match a backslash or the letter s instead.
sed -i -E \
    -e '/Biotime Biotech/d' \
    -e '/Bio[Tt]ime,* Inc/d' \
    -e '/[Bb]io[tT]imer/d' \
    -e '/[bB]io[tT]imes/d' \
    -e '/Alameda/d' \
    -e '/[Bb]io[Tt]im[[:space:],.]/d' \
    BioTIME.csv

# Removing Neptune look-alike lines.
# The planet names are a real alternation here; written as a bracket
# expression they were only a set of single characters, not whole words.
sed -i -E \
    -e '/Neptune City/d' \
    -e '/Neptune, New/d' \
    -e '/Neptune, NJ/d' \
    -e '/Uranus|Jupiter|Saturn|Pluto/d' \
    -e '/Poseidon/d' \
    -e '/MC[-–]*ICP/d' \
    -e '/NSB/d' \
    Neptune.csv

0 commit comments

Comments
 (0)