Skip to content

Commit 9b222a8

Browse files
committed
marker set confidence added
1 parent e3ff30c commit 9b222a8

19 files changed

+2211
-479
lines changed

src/ontology/bdso-edit.owl

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ Declaration(AnnotationProperty(<http://purl.obolibrary.org/obo/PCL_0010058>))
4545
Declaration(AnnotationProperty(<http://purl.obolibrary.org/obo/PCL_0010059>))
4646
Declaration(AnnotationProperty(<http://purl.obolibrary.org/obo/PCL_0010060>))
4747
Declaration(AnnotationProperty(<http://purl.obolibrary.org/obo/PCL_0010061>))
48+
Declaration(AnnotationProperty(<http://purl.obolibrary.org/obo/PCL_0010062>))
4849
Declaration(AnnotationProperty(dce:contributor))
4950
Declaration(AnnotationProperty(dce:description))
5051
Declaration(AnnotationProperty(dce:title))
@@ -102,6 +103,10 @@ AnnotationAssertion(rdfs:label <http://purl.obolibrary.org/obo/PCL_0010060> "cel
102103

103104
AnnotationAssertion(rdfs:label <http://purl.obolibrary.org/obo/PCL_0010061> "nuclei_count")
104105

106+
# Annotation Property: <http://purl.obolibrary.org/obo/PCL_0010062> (fbeta_confidence_score)
107+
108+
AnnotationAssertion(rdfs:label <http://purl.obolibrary.org/obo/PCL_0010062> "fbeta_confidence_score")
109+
105110

106111
############################
107112
# Object Properties

src/patterns/data/default/CCN201912131_marker_set.tsv

Lines changed: 142 additions & 142 deletions
Large diffs are not rendered by default.

src/patterns/data/default/CCN201912132_marker_set.tsv

Lines changed: 106 additions & 106 deletions
Large diffs are not rendered by default.

src/patterns/data/default/CCN202002013_marker_set.tsv

Lines changed: 150 additions & 150 deletions
Large diffs are not rendered by default.

src/patterns/data/default/CS1908210_marker_set.tsv

Lines changed: 76 additions & 76 deletions
Large diffs are not rendered by default.

src/patterns/dosdp-patterns/taxonomy_marker_set.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,10 @@ data_vars:
2222
Marker_set_of: "xsd:string"
2323
Brain_region_abbv: "xsd:string"
2424
Species_abbv: "xsd:string"
25+
FBeta_confidence_score: "xsd:double"
26+
27+
annotationProperties:
28+
fbetaConfidenceScore: "PCL:0010062"
2529

2630
name:
2731
text: "NS forest marker set of %s %s (%s)."
@@ -46,3 +50,9 @@ logical_axioms:
4650
# text: "'has_anatomical_context' some %s"
4751
# vars:
4852
# - Brain_region
53+
54+
annotations:
55+
- annotationProperty: fbetaConfidenceScore
56+
text: "%s"
57+
vars:
58+
- FBeta_confidence_score

src/scripts/marker_tools.py

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import logging
55
import os
66

7-
from template_generation_utils import get_root_nodes, read_taxonomy_config, generate_dendrogram_tree
7+
from template_generation_utils import get_root_nodes, read_taxonomy_config, generate_dendrogram_tree, index_dendrogram, read_csv_to_dict
88
from dendrogram_tools import dend_json_2_nodes_n_edges
99
from nomenclature_tools import nomenclature_2_nodes_n_edges
1010

@@ -190,3 +190,56 @@ def generate_marker_table(marker_data, output_filepath):
190190
template.append(d)
191191
class_robot_template = pd.DataFrame.from_records(template)
192192
class_robot_template.to_csv(output_filepath.replace("CCN", "CS"), sep="\t", index=False)
193+
194+
195+
def get_nsforest_confidences(taxon, dend, ns_forest_marker_file):
    """
    Build a dict of cell set accession id - NS forest confidence score.

    Each NS forest marker file has a non-standard tabular structure. This function aligns the cluster names
    mentioned in the marker file with the dendrogram nodes, trying several spelling variants of every cluster name
    against several dendrogram indexes.

    Args:
        taxon: taxonomy id. Decides whether the aligned alias index is built and is reported in error messages.
        dend: dendrogram data, handed unchanged to index_dendrogram.
        ns_forest_marker_file: path of the marker file. If no file exists at a ".csv" path, the same path with a
            ".tsv" extension is read instead.

    Returns:
        dict mapping the "cell_set_accession" of each matched node to the "f-measure" value of its marker file row.

    Raises:
        ValueError: if a cluster name of the marker file cannot be matched with any dendrogram node.
    """
    id_fields = ["cell_set_preferred_alias",
                 "cell_set_accession",
                 "original_label",
                 "cell_set_additional_aliases"]
    # NOTE(review): the aligned alias index is skipped for CS1908210 (human MTG); presumably that dendrogram
    # carries no aligned aliases - confirm.
    if taxon != "CS1908210":
        id_fields.append("cell_set_aligned_alias")

    # Index order matters: search_terms_in_index returns the hit of the first index that knows a term.
    nomenclature_indexes = [index_dendrogram(dend, id_field_name=id_field, id_to_lower=True)
                            for id_field in id_fields]

    marker_file = str(ns_forest_marker_file)
    if not os.path.isfile(marker_file):
        # human mtg file extension is outlier: fall back from .csv to .tsv. Only the extension is swapped, so a
        # ".csv" occurring elsewhere in the path is left untouched.
        root, extension = os.path.splitext(marker_file)
        if extension.lower() == ".csv":
            marker_file = root + ".tsv"

    # Choose the delimiter from the file that is actually read, so that a .tsv path passed in directly is parsed
    # as tab separated as well.
    if marker_file.lower().endswith(".tsv"):
        _, raw_marker_data = read_csv_to_dict(marker_file, id_column_name="clusterName", delimiter="\t")
    else:
        _, raw_marker_data = read_csv_to_dict(marker_file, id_column_name="clusterName")

    confidence_map = {}
    for cluster_name in raw_marker_data:
        lowered_name = cluster_name.lower()
        slashed_name = cluster_name.replace("-", "/")
        # Spelling variants under which a marker file cluster may appear in the nomenclature.
        cluster_name_variants = [lowered_name,
                                 lowered_name.replace("-", "/"),
                                 cluster_name.replace("Micro", "Microglia").lower(),
                                 ("(Mouse " + cluster_name + ")-like").lower(),
                                 ("(Mouse " + slashed_name + ")-like").lower()]

        nomenclature_node = search_terms_in_index(cluster_name_variants, nomenclature_indexes)
        if not nomenclature_node:
            raise ValueError("Node with cluster name '{}' couldn't be found in the nomenclature of {}."
                             .format(cluster_name, taxon))

        node_id = nomenclature_node["cell_set_accession"]
        confidence_map[node_id] = raw_marker_data[cluster_name]["f-measure"]

    return confidence_map
238+
239+
240+
def search_terms_in_index(term_variants, indexes):
    """
    Find the entry of the first term variant that is known to any of the indexes.

    Variants are tried in the given order and, for each variant, the indexes are probed in the given order.

    Args:
        term_variants: candidate keys, most preferred first.
        indexes: dict-like lookups to probe for each candidate.

    Returns:
        The value stored under the first matching key, or None if no variant is found in any index.
    """
    hits = (lookup[candidate]
            for candidate in term_variants
            for lookup in indexes
            if candidate in lookup)
    return next(hits, None)

src/scripts/template_generation_tools.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
extract_taxonomy_name_from_path
1111
from nomenclature_tools import nomenclature_2_nodes_n_edges
1212
from pcl_id_factory import get_class_id, get_individual_id, get_taxonomy_id, get_dataset_id, get_marker_gene_set_id
13+
from marker_tools import get_nsforest_confidences
1314

1415
log = logging.getLogger(__name__)
1516

@@ -28,6 +29,7 @@
2829
ALLEN_DESCRIPTIONS_PATH = "{}/{}/All Descriptions_{}.json"
2930
DATASET_INFO_CSV = "{}/{}/{}_landingpage_dataset_info.csv"
3031
TAXONOMY_INFO_CSV = "{}/{}/{}_Taxonomy_Info_Panel.csv"
32+
NSFOREST_MARKER_CSV = "{}/NSForestMarkers/{}_{}_NSForest_Markers.csv"
3133

3234
EXPRESSION_SEPARATOR = "|"
3335

@@ -513,7 +515,7 @@ def generate_datasets_template(centralized_data_folder, output_filepath):
513515
.format(taxonomy_id, expected_file_name))
514516

515517

516-
def generate_marker_gene_set_template(taxonomy_file_path, output_filepath):
518+
def generate_marker_gene_set_template(taxonomy_file_path, centralized_data_folder, output_filepath):
517519
taxon = extract_taxonomy_name_from_path(taxonomy_file_path)
518520
taxonomy_config = read_taxonomy_config(taxon)
519521

@@ -529,17 +531,22 @@ def generate_marker_gene_set_template(taxonomy_file_path, output_filepath):
529531
gene_db_path = ENSEMBLE_PATH.format(str(taxonomy_config["Reference_gene_list"][0]).strip().lower())
530532
gene_names = read_gene_data(gene_db_path)
531533
minimal_markers = read_markers(MARKER_PATH.format(taxon.replace("CCN", "").replace("CS", "")), gene_names)
532-
533534
else:
534535
minimal_markers = {}
535536

537+
ns_forest_marker_file = NSFOREST_MARKER_CSV.format(centralized_data_folder,
538+
taxonomy_config['Species_abbv'][0],
539+
taxonomy_config['Brain_region_abbv'][0])
540+
confidences = get_nsforest_confidences(taxon, dend, ns_forest_marker_file)
541+
536542
class_seed = ['defined_class',
537543
'Marker_set_of',
538544
'Minimal_markers',
539545
'Brain_region_abbv',
540546
'Species_abbv',
541547
'Brain_region',
542-
'Parent'
548+
'Parent',
549+
'FBeta_confidence_score'
543550
]
544551
class_template = []
545552

@@ -557,6 +564,8 @@ def generate_marker_gene_set_template(taxonomy_file_path, output_filepath):
557564
d['Species_abbv'] = taxonomy_config['Species_abbv'][0]
558565
d['Brain_region'] = taxonomy_config['Brain_region'][0]
559566
d['Parent'] = "SO:0001260" # sequence collection
567+
if o['cell_set_accession'] in confidences:
568+
d['FBeta_confidence_score'] = confidences[o['cell_set_accession']]
560569

561570
for k in class_seed:
562571
if not (k in d.keys()):

src/scripts/template_runner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,6 @@
5858
elif args.tx:
5959
generate_taxonomies_template(args.input, args.output)
6060
elif args.ms:
61-
generate_marker_gene_set_template(args.input, args.output)
61+
generate_marker_gene_set_template(args.input, args.input2, args.output)
6262
else:
6363
generate_ind_template(args.input, args.input2, args.output)

src/test/marker_tools_test.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
import unittest
2+
import os
3+
from marker_tools import get_nsforest_confidences
4+
from dendrogram_tools import dend_json_2_nodes_n_edges
5+
from nomenclature_tools import nomenclature_2_nodes_n_edges
6+
7+
current_dir = os.path.dirname(os.path.realpath(__file__))
8+
PATH_MOUSE_NOMENCLATURE = os.path.join(current_dir, "../dendrograms/nomenclature_table_CCN202002013.csv")
9+
PATH_HUMAN_M1_NOMENCLATURE = os.path.join(current_dir, "../dendrograms/nomenclature_table_CCN201912131.csv")
10+
PATH_MARMOSET_M1_NOMENCLATURE = os.path.join(current_dir, "../dendrograms/nomenclature_table_CCN201912132.csv")
11+
PATH_HUMAN_MTG_NOMENCLATURE = os.path.join(current_dir, "../dendrograms/CS1908210.json")
12+
PATH_CENTRALIZED_DATA = os.path.join(current_dir, "./test_data/centralized_data/MOp_taxonomies_ontology/")
13+
14+
class MarkerToolsTest(unittest.TestCase):
    """Checks get_nsforest_confidences against the NS forest marker fixture of each taxonomy.

    Confidence scores are compared as strings, exactly as they are read from the marker files.
    """

    def test_get_nsforest_confidences_mouse(self):
        """Mouse MOp markers are mapped to cell set accession ids."""
        dend = nomenclature_2_nodes_n_edges(PATH_MOUSE_NOMENCLATURE)
        marker_path = os.path.join(PATH_CENTRALIZED_DATA, "NSForestMarkers/Mouse_MOp_NSForest_Markers.csv")
        confidences = get_nsforest_confidences("CCN202002013", dend, marker_path)

        self.assertEqual("0.988920549", confidences["CS202002013_179"])
        self.assertEqual("0.607476636", confidences["CS202002013_3"])

    def test_get_nsforest_confidences_human_m1(self):
        """Human M1 markers are mapped; nodes without a marker row get no confidence."""
        dend = nomenclature_2_nodes_n_edges(PATH_HUMAN_M1_NOMENCLATURE)
        marker_path = os.path.join(PATH_CENTRALIZED_DATA, "NSForestMarkers/Human_M1_NSForest_Markers.csv")
        confidences = get_nsforest_confidences("CS201912131", dend, marker_path)

        self.assertEqual("0.989133387", confidences["CS201912131_150"])  # Glutamatergic
        self.assertEqual("0.689189189", confidences["CS201912131_69"])  # Inh L5-6 PVALB GAPDHP60
        self.assertEqual("0.825020189", confidences["CS201912131_134"])  # (Mouse L5 IT)-like
        self.assertNotIn("CS201912131_145", confidences)  # (Mouse IT projecting)-like
        self.assertNotIn("CS201912131_162", confidences)  # (Mouse Vip)-like_C4

    def test_get_nsforest_confidences_marmoset_m1(self):
        """Marmoset M1 markers are mapped; nodes without a marker row get no confidence."""
        dend = nomenclature_2_nodes_n_edges(PATH_MARMOSET_M1_NOMENCLATURE)
        marker_path = os.path.join(PATH_CENTRALIZED_DATA, "NSForestMarkers/Marmoset_M1_NSForest_Markers.csv")
        confidences = get_nsforest_confidences("CS201912132", dend, marker_path)

        self.assertEqual("0.428954424", confidences["CS201912132_79"])  # Astro FGFR3 EPHB1
        self.assertEqual("0.759294697", confidences["CS201912132_104"])  # (Mouse L6 CT)-like
        self.assertEqual("0.7227388", confidences["CS201912132_108"])  # Peri
        self.assertEqual("0.790527018", confidences["CS201912132_109"])  # VLMC
        self.assertEqual("0.889261745", confidences["CS201912132_93"])  # VLMC SLC1A3-like SLC47A1
        self.assertNotIn("CS201912132_113", confidences)  # (Mouse Non-IT projecting)-like
        self.assertNotIn("CS201912132_135", confidences)  # All Cells

    def test_get_nsforest_confidences_human_mtg(self):
        """Human MTG uses a json dendrogram and a .tsv marker file."""
        dend = dend_json_2_nodes_n_edges(PATH_HUMAN_MTG_NOMENCLATURE)
        marker_path = os.path.join(PATH_CENTRALIZED_DATA, "NSForestMarkers/Human_MTG_NSForest_Markers.tsv")
        confidences = get_nsforest_confidences("CS1908210", dend, marker_path)

        self.assertEqual("0.55", confidences["CS1908210011"])  # Inh L1-2 VIP TSPAN12
        self.assertEqual("0.58", confidences["CS1908210059"])  # Exc L4-6 RORB C1R
        self.assertEqual("0.91", confidences["CS1908210075"])  # Micro L1-6 TYROBP

0 commit comments

Comments
 (0)