Skip to content

Commit 5b99c6c

Browse files
committed
Merge branch 'master' into pcl_migration
# Conflicts: # src/dendrograms/taxonomy_details.yaml # src/markers/CS202002013_markers.tsv
2 parents 4390d85 + a3de155 commit 5b99c6c

File tree

10 files changed

+516
-30
lines changed

10 files changed

+516
-30
lines changed
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
name: Validate Dendrograms
2+
3+
# This validation currently fails, so it was added as a separate action and disabled. In the future it can be integrated into the main build task.
4+
on:
5+
# Triggers the workflow on push events that change a dendrogram JSON file or this workflow file
6+
push:
7+
paths:
8+
- 'src/dendrograms/**.json'
9+
- '.github/workflows/dendrogram_check.yaml'
10+
11+
# Allows you to run this workflow manually from the Actions tab
12+
workflow_dispatch:
13+
14+
jobs:
15+
validate-dendrograms:
16+
runs-on: ubuntu-latest
17+
18+
steps:
19+
- uses: actions/checkout@v2
20+
- name: install dependencies
21+
run: |
22+
python -m pip install --upgrade pip
23+
pip install -r requirements.txt
24+
- name: validate dendrograms
25+
run: python ./src/scripts/dendrogram_validator.py

README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@ A repository for building ontologies for the Brain Data Standards Project.
55

66
Status: Draft
77

8+
### Cite:
9+
[BioRxiv Preprint](https://www.biorxiv.org/content/10.1101/2021.10.10.463703)
10+
811
### Overview:
912

1013
The main purpose of this repo is to automate data-driven cell-type ontology development for the Brain Data Standards initiative. The main inputs are:
@@ -100,6 +103,9 @@ Markers are referenced by enembl ID using an [identifiers.org URL scheme](https:
100103

101104
Ensembl gene file templates are used to generate mirror files, which act as source files for import generation, so that only referenced markers end up in the release files.
102105

106+
### Reference Gene Files
107+
108+
The GTF files used as the reference for BDSO can be found in this [Google Drive folder](https://drive.google.com/drive/folders/1rOYwiIxGgEolWsO3a-7g6rxUefsXIcKB).
103109

104110

105111

src/config/config_schema.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@
9090
},
9191
"Location_relation": {
9292
"type": "string",
93-
"description": ""
93+
"description": "The relation to use to link cell type to brain region. See guidance on CL repo. Rule of thumb: For neurons use has_soma_location. For other cell types use part_of."
9494
}
9595
}
9696
},

src/markers/CS202002013_markers.txt

Lines changed: 231 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
import logging
2+
import os
3+
from dendrogram_tools import dend_json_2_nodes_n_edges
4+
from abc import ABC, abstractmethod
5+
from os.path import isfile, join
6+
7+
log = logging.getLogger(__name__)
8+
9+
DENDROGRAM_FOLDER = join(os.path.dirname(os.path.realpath(__file__)), "../dendrograms")
10+
11+
12+
class BaseChecker(ABC):
    """Common interface for the checks that are run against a dendrogram."""

    @abstractmethod
    def check(self, dend_file, dendrogram):
        """Validate a single dendrogram.

        :param dend_file: name of the dendrogram being checked (used in log messages).
        :param dendrogram: parsed dendrogram content.
        :return: the checker in this file returns True when the dendrogram is valid and
            False otherwise; this base implementation returns None.
        """
17+
18+
19+
class PrefAliasUniquenessChecker(BaseChecker):
    """
    cell_set_preferred_alias should be unique within any one dendrogram - ignoring nodes with no
    cell_set_preferred_alias, no two nodes should have the same one
    """

    def __init__(self):
        # Kept for backward compatibility; check() does not populate it.
        self.reports = []

    def check(self, dend_file, dendrogram):
        """Report every node whose cell_set_preferred_alias repeats an earlier one.

        :param dend_file: name of the dendrogram, only used in the error messages.
        :param dendrogram: mapping with a 'nodes' list of node dictionaries.
        :return: True when no alias occurs twice, False otherwise (duplicates are logged).
        """
        seen_aliases = set()
        is_valid = True
        for node in dendrogram['nodes']:
            # .get(): a node without the key is treated like a node with an empty alias
            # and skipped, as the class docstring requires.
            alias = node.get('cell_set_preferred_alias')
            if not alias:
                continue
            if alias in seen_aliases:
                is_valid = False
                log.error("cell_set_preferred_alias '{}' is duplicate in {}"
                          .format(alias, dend_file))
            else:
                seen_aliases.add(alias)
        return is_valid
40+
41+
42+
class ValidationError(Exception):
    """Raised when at least one dendrogram fails validation.

    :param message: human readable description of the failure; available both as
        ``str(error)`` and as the ``message`` attribute.
    """

    def __init__(self, message):
        # Forward the message to Exception so that str(error) and tracebacks show it.
        super().__init__(message)
        self.message = message
47+
48+
49+
def main():
    """Validate every ``.json`` dendrogram found directly in DENDROGRAM_FOLDER.

    All files are checked even after a failure so that every issue gets logged.

    :raises ValidationError: when at least one dendrogram is invalid.
    """
    log.info("Dendrogram validation started.")
    is_valid = True
    # os.listdir returns entries in arbitrary order; sort for reproducible log output.
    for file_name in sorted(os.listdir(DENDROGRAM_FOLDER)):
        file_path = join(DENDROGRAM_FOLDER, file_name)
        base_name, file_extension = os.path.splitext(file_name)
        if file_extension != ".json" or not isfile(file_path):
            continue
        dend = dend_json_2_nodes_n_edges(file_path)
        # Run the check before combining so it is never short-circuited away.
        is_valid = PrefAliasUniquenessChecker().check(base_name, dend) and is_valid

    if not is_valid:
        raise ValidationError("Dendrogram validation failed and issues logged.")


if __name__ == '__main__':
    main()
Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
import logging
2+
import networkx as nx
3+
import os
4+
import matplotlib.pyplot as plt
5+
from networkx.drawing.nx_agraph import graphviz_layout
6+
from networkx.algorithms.traversal.depth_first_search import dfs_tree
7+
from dendrogram_tools import dend_json_2_nodes_n_edges
8+
from template_generation_utils import read_dendrogram_tree, index_dendrogram, generate_dendrogram_tree
9+
from marker_tools import read_marker_file
10+
from nomenclature_tools import nomenclature_2_nodes_n_edges
11+
12+
# Identifier of the taxonomy that is drawn; it is also the prefix of the node ids.
TAXON = "CS202002013"
# TAXON = "CS201912132"

# Amount subtracted from the y coordinate of the rotated labels.
NODE_LABEL_DISPLACEMENT = 1200

# Vertical distance between an intermediate node and its highest child.
NODE_Y_DISPLACEMENT = 300

# Horizontal distance between two consecutive leaves.
GAP_BETWEEN_LEAFS = 500

# Drawing size of non-leaf nodes; also the minimum horizontal spacing enforced between them.
INTERMEDIATE_NODE_SIZE = 300

# Drawing size of leaf nodes.
LEAF_NODE_SIZE = 50

# Folder that contains this script; the input files are located relative to it.
_SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))

PATH_DEND_JSON = os.path.join(_SCRIPT_DIR, "../dendrograms/" + TAXON + ".json")

PATH_NMN_TABLE = os.path.join(_SCRIPT_DIR, "../dendrograms/nomenclature_table_CCN202002013.csv")

PATH_MARKERS = os.path.join(_SCRIPT_DIR, "../markers/" + TAXON + "_markers.tsv")

# Label prefixes that define the left-to-right leaf order when exact ordering is not requested.
CLUSTER_ORDER = [
    "Lamp5", "Sncg", "Vip", "Sst", "Pvalb", "L2", "L4", "L5", "L6", "Meis",
    "OPC", "Astro", "Oligo", "Endo", "VLMC", "SMC", "Peri", "Micro", "PVM",
]
33+
34+
35+
def visualise_tree(root=None, exact_order=True):
    """Build the taxonomy tree from the nomenclature table and show it with matplotlib.

    :param root: optional node id; when given, only the tree that dfs_tree builds from
        that node is drawn.
    :param exact_order: passed on to position_leaf_nodes to select the leaf ordering.
    """
    nomenclature = nomenclature_2_nodes_n_edges(PATH_NMN_TABLE)
    tree = generate_dendrogram_tree(nomenclature)
    if root is not None:
        tree = dfs_tree(tree, root)

    markers = read_marker_file(PATH_MARKERS)

    # Labels must be attached before decorate_nodes reads them.
    add_label_metadata(tree)
    colours, labels, sizes, pos = decorate_nodes(markers, tree)

    leaves = [entry for entry in tree.nodes(data=True) if tree.out_degree(entry[0]) == 0]

    # Leaves are laid out first; intermediate nodes are then placed relative to their children.
    position_leaf_nodes(leaves, exact_order, pos)
    position_intermediate_nodes(leaves, pos, tree)

    nx.draw_networkx_nodes(tree, pos, node_color=colours, node_size=sizes)
    nx.draw_networkx_edges(tree, pos, arrows=True, connectionstyle="arc3,rad=0.1")
    drawn_labels = nx.draw_networkx_labels(tree, pos, labels, font_size=7)
    rotate_leaf_labels(drawn_labels)

    plt.show()
59+
60+
61+
def position_leaf_nodes(all_leaves, exact_order, pos):
    """Place all leaves on one horizontal line, GAP_BETWEEN_LEAFS apart, starting at x=0.

    :param all_leaves: list of (node_id, data) pairs of the leaf nodes.
    :param exact_order: when true, leaves are ordered by the number that follows the
        ``TAXON_`` prefix of their id; otherwise they are grouped by the label prefixes
        listed in CLUSTER_ORDER.
    :param pos: node id -> (x, y) mapping; updated in place.
    """
    min_depth = get_min_depth(all_leaves, pos)
    last_x = 0

    if exact_order:
        # sort by accession_id increasing
        id_prefix = TAXON + "_"
        ordered = sorted(all_leaves, key=lambda node: int(str(node[0]).replace(id_prefix, "")))
        logging.debug("Leaf order: %s", [node[1].get("label", node[0]) for node in ordered])
        # Place every leaf exactly once. Matching leaves by label prefix here would move a
        # leaf again whenever its label starts with (or equals) another leaf's label.
        for node in ordered:
            pos[node[0]] = (last_x, min_depth)
            last_x += GAP_BETWEEN_LEAFS
        return

    for cluster in CLUSTER_ORDER:
        last_x = position_cluster_leafs(cluster, pos, all_leaves, min_depth, last_x)
78+
79+
80+
def position_cluster_leafs(cluster, pos, all_leafs, min_depth, last_x=2):
    """Line up the leaves whose label starts with ``cluster``, keeping their current x order.

    :param cluster: label prefix that selects the leaves.
    :param pos: node id -> (x, y) mapping; updated in place.
    :param all_leafs: (node_id, data) pairs of all leaves; ``data["label"]`` is read.
    :param min_depth: y coordinate given to every positioned leaf.
    :param last_x: x coordinate of the first positioned leaf.
    :return: x coordinate for the next leaf (``last_x`` unchanged when nothing matched).
    """
    matching = [leaf for leaf in all_leafs if leaf[1]["label"].startswith(cluster)]
    if not matching:
        logging.error("Node '" + cluster + "' that exists in reference order, not exists in the dendrogram.")
        return last_x

    matching.sort(key=lambda leaf: pos[leaf[0]][0])

    next_x = last_x
    for leaf in matching:
        pos[leaf[0]] = (next_x, min_depth)
        next_x += GAP_BETWEEN_LEAFS
    return next_x
93+
94+
95+
def add_label_metadata(tree):
    """Copy ``cell_set_preferred_alias`` from the nomenclature table into each node's ``label``.

    Nodes whose nomenclature entry has no ``cell_set_preferred_alias`` key are left untouched.

    :param tree: graph whose node data dictionaries are updated in place.
    """
    nomenclature = nomenclature_2_nodes_n_edges(PATH_NMN_TABLE)
    entries_by_id = index_dendrogram(nomenclature)
    for node in tree.nodes(data=True):
        entry = entries_by_id[node[0]]
        if "cell_set_preferred_alias" not in entry:
            continue
        node[1]["label"] = entry["cell_set_preferred_alias"]
103+
104+
105+
def decorate_nodes(marker_expressions, tree):
    """Compute colours, labels, sizes and an initial graphviz layout for the tree nodes.

    Also sets the plot title to TAXON.

    :param marker_expressions: mapping keyed by node id; nodes present in it are coloured
        light red, all others sky blue.
    :param tree: graph to decorate; leaf nodes must carry a ``label`` in their data.
    :return: tuple (colours, labels, sizes, pos); colours and sizes follow the iteration
        order of ``tree.nodes``, labels is keyed by node id.
    """
    labels = {}
    colours = []
    sizes = []
    marker_ids = marker_expressions.keys()
    for node in tree.nodes(data=True):
        node_id = node[0]
        is_leaf = tree.out_degree(node_id) == 0
        if is_leaf:
            short_id = str(node_id).replace(TAXON + "_", "")
            labels[node_id] = node[1]["label"] + " (" + short_id + ")"
        else:
            # Only the taxon is stripped here, so the separator before the number is kept.
            labels[node_id] = str(node_id).replace(TAXON, "")
        sizes.append(LEAF_NODE_SIZE if is_leaf else INTERMEDIATE_NODE_SIZE)

        # nodes that also exist in the marker file will be displayed as red, others as blue
        has_marker = str(node_id) in marker_ids
        colours.append('#F08080' if has_marker else '#00BFFF')

    plt.title(TAXON)
    pos = graphviz_layout(tree, prog='dot')
    return colours, labels, sizes, pos
128+
129+
130+
def position_intermediate_nodes(all_leafs, pos, tree):
    """Centre every non-leaf node above its direct children and then resolve overlaps.

    Nodes are processed in increasing order of their current y coordinate.

    :param all_leafs: (node_id, data) pairs of the leaves; every other node is treated
        as intermediate.
    :param pos: node id -> (x, y) mapping; updated in place.
    :param tree: graph that provides the nodes and their successors.
    """
    inner_nodes = [entry for entry in tree.nodes(data=True) if entry not in all_leafs]
    inner_nodes.sort(key=lambda entry: pos[entry[0]][1])

    for entry in inner_nodes:
        child_positions = [pos[child] for child in tree.successors(entry[0])]
        child_xs = [position[0] for position in child_positions]
        child_ys = [position[1] for position in child_positions]
        # The sentinels are kept in front, so they win ties and remain the result
        # when the node has no children.
        left = min([99999] + child_xs)
        right = max([0] + child_xs)
        top = max([0] + child_ys)
        pos[entry[0]] = ((left + right) / 2, top + NODE_Y_DISPLACEMENT)

    fix_intemediate_overlaps(inner_nodes, pos)
149+
150+
151+
def fix_intemediate_overlaps(intermediate_nodes, pos, changed=False):
    """Shift intermediate nodes to the right until no pass has to move a node.

    In each pass the nodes are visited by increasing x. A node is moved when the previous
    node's y minus its own y is below 50 and it lies less than INTERMEDIATE_NODE_SIZE to
    the right of the previous node; it is then placed INTERMEDIATE_NODE_SIZE + 200 after it.

    :param intermediate_nodes: (node_id, data) pairs of the nodes to inspect.
    :param pos: node id -> (x, y) mapping; updated in place.
    :param changed: when true, a further pass is run even if the first one moves nothing.
    """
    while True:
        intermediate_nodes = sorted(intermediate_nodes, key=lambda node: pos[node[0]][0])

        previous_x = -1000
        previous_y = -1000
        for node in intermediate_nodes:
            node_id = node[0]
            too_close = (previous_y - pos[node_id][1] < 50
                         and pos[node_id][0] - previous_x < INTERMEDIATE_NODE_SIZE)
            if too_close:
                displacement = INTERMEDIATE_NODE_SIZE + 200 - (pos[node_id][0] - previous_x)
                pos[node_id] = (pos[node_id][0] + displacement, pos[node_id][1])
                changed = True
            previous_x = pos[node_id][0]
            previous_y = pos[node_id][1]

        if not changed:
            return
        # Start the next pass with a clean flag.
        changed = False
167+
168+
169+
def rotate_leaf_labels(text):
    """Rotate and lower every label whose text does not start with an underscore.

    :param text: mapping of node id to the text object drawn for that node.
    """
    for label in text.values():
        if str(label._text).startswith("_"):
            # presumably an intermediate node label (node id with the taxon removed) - left as is
            continue
        label.set_rotation('vertical')
        label._y = label._y - NODE_LABEL_DISPLACEMENT
        # NOTE(review): 'right' is normally a horizontal alignment value - confirm the intended effect.
        label._verticalalignment = 'right'
175+
176+
177+
def get_min_depth(all_leafs, pos):
    """Return the smallest y coordinate among the given leaves, capped at 99999.

    :param all_leafs: (node_id, data) pairs of the leaves.
    :param pos: node id -> (x, y) mapping.
    :return: the lowest y value, or 99999 when there are no leaves or none lies below it.
    """
    depths = [pos[leaf[0]][1] for leaf in all_leafs]
    # The sentinel goes first so that it is returned on ties.
    return min([99999] + depths)
184+
185+
186+
# Only draw when executed as a script, so that importing this module has no side effects.
if __name__ == "__main__":
    visualise_tree()
    # Examples for drawing only the part of the tree below a given node:
    # visualise_tree("CS202002013_123")
    # visualise_tree("CS202002013_179")

src/test/denormalised_marker_generation_test.py

Lines changed: 1 addition & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,8 @@
11
import unittest
22
import networkx as nx
33
import os
4-
# import matplotlib.pyplot as plt
5-
from networkx.drawing.nx_agraph import graphviz_layout
64
from template_generation_utils import read_dendrogram_tree
7-
from marker_tools import generate_denormalised_marker, read_marker_file, \
8-
extend_expressions
9-
5+
from marker_tools import read_marker_file, extend_expressions
106

117
PATH_DEND_JSON = os.path.join(os.path.dirname(os.path.realpath(__file__)), "./test_data/CCN202002013.json")
128

@@ -23,30 +19,6 @@ def delete_file(path_to_file):
2319
os.remove(path_to_file)
2420

2521

26-
# def visualise_tree():
27-
# tree = read_dendrogram_tree(PATH_DEND_JSON)
28-
# marker_expressions = read_marker_file(PATH_MARKERS)
29-
#
30-
# labels = {}
31-
# color_map = []
32-
# for node in tree.nodes():
33-
# labels[node] = str(node).replace("CS202002013", "")
34-
# # nodes that also exist in the marker file will be displayed as red, others as blue
35-
# if str(node) in marker_expressions.keys():
36-
# # light red
37-
# color_map.append('#F08080')
38-
# else:
39-
# # sky blue
40-
# color_map.append('#00BFFF')
41-
#
42-
# plt.title('CCN202002013')
43-
# pos = graphviz_layout(tree, prog='dot')
44-
# nx.draw(tree, pos, node_color=color_map, with_labels=False, arrows=False)
45-
# nx.draw_networkx_labels(tree, pos, labels, font_size=7)
46-
#
47-
# plt.show()
48-
49-
5022
class DenormalisedMarkerTest(unittest.TestCase):
5123

5224
def test_tree_descendants(self):
82 KB
Loading
201 KB
Loading
132 KB
Loading

0 commit comments

Comments
 (0)