Skip to content

Commit 985f56e

Browse files
committed
Switching to use Allen Institute Repo files #181
1 parent 0ccd705 commit 985f56e

File tree

7 files changed

+140
-75
lines changed

7 files changed

+140
-75
lines changed

src/dendrograms/Makefile

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
JOBS = CCN202002013 CCN201912131 CCN201912132 CS1908210 # CCN202002270 CCN201908210 CCN201810310 CCN201908211
22

3+
# GIT related configurations
4+
GIT_REPO= https://github.com/AllenInstitute/MOp_taxonomies_ontology.git
5+
REPO_NAME= MOp_taxonomies_ontology
6+
CENTRALIZED_DIR= centralized_data
7+
38
TEMPLATE_FILES = $(patsubst %, ../templates/%.tsv, $(JOBS))
49
MARKER_DENORMALIZED_FILES = $(patsubst %, ../markers/%_markers_denormalized.tsv, $(JOBS))
510
TEMPLATE_CLASS_BASE_FILES = $(patsubst %, ../patterns/data/default/%_class_base.tsv, $(JOBS))
@@ -10,13 +15,11 @@ DATASET_TEMPLATE_FILES = $(patsubst %, ../templates/%_dataset.tsv, $(JOBS))
1015
MARKER_SET_TEMPLATE_FILES = $(patsubst %, ../patterns/data/default/%_marker_set.tsv, $(JOBS))
1116
TAXONOMY_TEMPLATE_FILE = ../templates/Taxonomies.tsv
1217

13-
14-
15-
all: $(TEMPLATE_FILES) $(TEMPLATE_CLASS_BASE_FILES) $(TEMPLATE_CLASS_CURATION_FILES) $(MARKER_DENORMALIZED_FILES) $(TAXONOMY_TEMPLATE_FILE) $(APP_SPECIFIC_TEMPLATE_FILES) $(TEMPLATE_CLASS_HOMOLOGOUS_FILES) $(DATASET_TEMPLATE_FILES) $(MARKER_SET_TEMPLATE_FILES)
18+
all: update_centralized_repo $(TEMPLATE_FILES) $(TEMPLATE_CLASS_BASE_FILES) $(TEMPLATE_CLASS_CURATION_FILES) $(MARKER_DENORMALIZED_FILES) $(TAXONOMY_TEMPLATE_FILE) $(APP_SPECIFIC_TEMPLATE_FILES) $(TEMPLATE_CLASS_HOMOLOGOUS_FILES) $(DATASET_TEMPLATE_FILES) $(MARKER_SET_TEMPLATE_FILES)
1619

1720
../templates/%.tsv: %.json nomenclature_table_%.csv
18-
if [ $< = CS1908210.json ]; then python ../scripts/template_runner.py generator -i $< -o $@ ;\
19-
else python ../scripts/template_runner.py generator -i $(word 2, $^) -o $@ ; fi
21+
if [ $< = CS1908210.json ]; then python ../scripts/template_runner.py generator -i $< -i2 $(CENTRALIZED_DIR)/$(REPO_NAME) -o $@ ;\
22+
else python ../scripts/template_runner.py generator -i $(word 2, $^) -i2 $(CENTRALIZED_DIR)/$(REPO_NAME) -o $@ ; fi
2023

2124
../markers/%_markers_denormalized.tsv: %.json nomenclature_table_%.csv
2225
if [ $< = CS1908210.json ]; then python ../scripts/template_runner.py generator -md -i $< -o $@ ;\
@@ -38,13 +41,20 @@ all: $(TEMPLATE_FILES) $(TEMPLATE_CLASS_BASE_FILES) $(TEMPLATE_CLASS_CURATION_FI
3841
if [ $< = CS1908210.json ]; then python ../scripts/template_runner.py generator -a -i $< -o $@ ;\
3942
else python ../scripts/template_runner.py generator -a -i $(word 2, $^) -o $@ ; fi
4043

41-
../templates/%_dataset.tsv: MOp_taxonomies_ontology/%_landingpage_dataset_info.csv
42-
python ../scripts/template_runner.py generator -ds -i $< -o $@
43-
4444
../patterns/data/default/%_marker_set.tsv: %.json nomenclature_table_%.csv
4545
if [ $< = CS1908210.json ]; then python ../scripts/template_runner.py generator -ms -i $< -o $@ ;\
4646
else python ../scripts/template_runner.py generator -ms -i $(word 2, $^) -o $@ ; fi
4747

48-
../templates/Taxonomies.tsv: MOp_taxonomies_ontology/Taxonomy_Info_Panel.csv
48+
../templates/%_dataset.tsv: $(CENTRALIZED_DIR)/$(REPO_NAME)
49+
python ../scripts/template_runner.py generator -ds -i $< -o $@
50+
51+
../templates/Taxonomies.tsv: $(CENTRALIZED_DIR)/$(REPO_NAME)
4952
python ../scripts/template_runner.py generator -tx -i $< -o $@
5053

54+
55+
# Git centralized repo retrieval
# Both targets below are commands, not files: declare them phony so that a stray
# file or folder with the same name can never make them look "up to date".
.PHONY: update_centralized_repo clean_centralized_data_folder

# Clones a fresh copy of $(GIT_REPO) into $(CENTRALIZED_DIR)/$(REPO_NAME) and
# drops its .git folder afterwards.
update_centralized_repo: clean_centralized_data_folder
	mkdir -p $(CENTRALIZED_DIR)
	git clone --quiet $(GIT_REPO) $(CENTRALIZED_DIR)/$(REPO_NAME)
	rm -rf $(CENTRALIZED_DIR)/$(REPO_NAME)/.git

# Removes the previously cloned copy; git clone refuses to write into an
# existing non-empty folder.
clean_centralized_data_folder:
	$(if $(and $(strip $(CENTRALIZED_DIR)),$(strip $(REPO_NAME))),,$(error CENTRALIZED_DIR and REPO_NAME must not be empty))
	rm -rf $(CENTRALIZED_DIR)/$(REPO_NAME)

src/ontology/bdso-edit.owl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,10 @@ Declaration(AnnotationProperty(dce:contributor))
4949
Declaration(AnnotationProperty(dce:description))
5050
Declaration(AnnotationProperty(dce:title))
5151
Declaration(AnnotationProperty(dcterms:license))
52+
Declaration(AnnotationProperty(dcterms:title))
53+
Declaration(AnnotationProperty(dcterms:provenance))
54+
Declaration(AnnotationProperty(dcterms:description))
55+
Declaration(AnnotationProperty(dcterms:subject))
5256
############################
5357
# Annotation Properties
5458
############################

src/ontology/template_prefixes.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
"n2o": "http://n2o.neo/custom/",
1111
"PR": "http://purl.obolibrary.org/obo/PR_",
1212
"jcvi": "http://www.jcvi.org/framework/nsf2_full_mtg#",
13-
"schema": "http://schema.org/"
13+
"schema": "http://schema.org/",
14+
"dcterms": "http://purl.org/dc/terms/"
1415
}
1516
}

src/ontology/template_prefixes.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,5 @@ CL: http://purl.obolibrary.org/obo/CL_
99
n2o: http://n2o.neo/custom/
1010
PR: http://purl.obolibrary.org/obo/PR_
1111
jcvi: http://www.jcvi.org/framework/nsf2_full_mtg#
12-
schema: http://schema.org/
12+
schema: http://schema.org/
13+
dcterms: http://purl.org/dc/terms/

src/scripts/template_generation_tools.py

Lines changed: 107 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
from nomenclature_tools import nomenclature_2_nodes_n_edges
1212
from pcl_id_factory import get_class_id, get_individual_id, get_taxonomy_id, get_dataset_id, get_marker_gene_set_id
1313

14-
1514
log = logging.getLogger(__name__)
1615

1716
PCL_BASE = 'http://purl.obolibrary.org/obo/PCL_'
@@ -24,13 +23,16 @@
2423
ENSEMBLE_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../templates/{}.tsv")
2524
CROSS_SPECIES_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)),
2625
"../dendrograms/nomenclature_table_CCN202002270.csv")
27-
ALLEN_DESCRIPTIONS_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)),
28-
'../dendrograms/MOp_taxonomies_ontology/All Descriptions_{}.json')
26+
27+
# centralized data files
28+
ALLEN_DESCRIPTIONS_PATH = "{}/{}/All Descriptions_{}.json"
29+
DATASET_INFO_CSV = "{}/{}/{}_landingpage_dataset_info.csv"
30+
TAXONOMY_INFO_CSV = "{}/{}/{}_Taxonomy_Info_Panel.csv"
2931

3032
EXPRESSION_SEPARATOR = "|"
3133

3234

33-
def generate_ind_template(taxonomy_file_path, output_filepath):
35+
def generate_ind_template(taxonomy_file_path, centralized_data_folder, output_filepath):
3436
path_parts = taxonomy_file_path.split(os.path.sep)
3537
taxon = path_parts[len(path_parts) - 1].split(".")[0]
3638

@@ -42,7 +44,11 @@ def generate_ind_template(taxonomy_file_path, output_filepath):
4244

4345
dend_tree = generate_dendrogram_tree(dend)
4446
taxonomy_config = read_taxonomy_config(taxon)
45-
allen_descriptions = read_allen_descriptions(ALLEN_DESCRIPTIONS_PATH, taxonomy_config['Species_abbv'][0])
47+
48+
taxonomy_folder_name = get_centralized_taxonomy_folder(taxonomy_config)
49+
allen_desc_file = ALLEN_DESCRIPTIONS_PATH.format(centralized_data_folder, taxonomy_folder_name,
50+
taxonomy_config['Species_abbv'][0])
51+
allen_descriptions = read_allen_descriptions(allen_desc_file)
4652

4753
subtrees = get_subtrees(dend_tree, taxonomy_config)
4854

@@ -108,7 +114,6 @@ def generate_ind_template(taxonomy_file_path, output_filepath):
108114
if allen_data["aliases"][0]:
109115
d['Aliases'] = '|'.join([alias.strip() for alias in str(allen_data["aliases"][0]).split("|")])
110116

111-
# There should only be one!
112117
dl.append(d)
113118
robot_template = pd.DataFrame.from_records(dl)
114119
robot_template.to_csv(output_filepath, sep="\t", index=False)
@@ -382,9 +387,8 @@ def generate_cross_species_template(taxonomy_file_path, output_filepath):
382387
class_robot_template.to_csv(output_filepath, sep="\t", index=False)
383388

384389

385-
def generate_taxonomies_template(taxonomy_metadata_path, output_filepath):
390+
def generate_taxonomies_template(centralized_data_folder, output_filepath):
386391
taxon_configs = read_taxonomy_details_yaml()
387-
headers, taxonomies_metadata = read_csv_to_dict(taxonomy_metadata_path)
388392

389393
robot_template_seed = {'ID': 'ID',
390394
'TYPE': 'TYPE',
@@ -397,7 +401,12 @@ def generate_taxonomies_template(taxonomy_metadata_path, output_filepath):
397401
'Species Label': "A skos:prefLabel",
398402
'Age': "A 'has_age'",
399403
'Sex': "A 'has_sex'",
400-
'Primary Citation': "A oboInOwl:hasDbXref"
404+
'Primary Citation': "A oboInOwl:hasDbXref",
405+
'Title': "A dcterms:title",
406+
'Description': "A rdfs:comment",
407+
'Attribution': "A rdfs:provenance",
408+
'SubDescription': "A rdfs:description",
409+
'Anatomy': "A rdfs:subject"
401410
}
402411
dl = [robot_template_seed]
403412

@@ -409,60 +418,89 @@ def generate_taxonomies_template(taxonomy_metadata_path, output_filepath):
409418
d['Label'] = taxon_config["Taxonomy_id"]
410419
d['Anatomic Region'] = taxon_config['Brain_region'][0]
411420
d['Primary Citation'] = taxon_config['PMID'][0]
412-
if taxon_config["Taxonomy_id"] in taxonomies_metadata:
413-
taxonomy_metadata = taxonomies_metadata[taxon_config["Taxonomy_id"]]
414-
d['Number of Cell Types'] = taxonomy_metadata["Cell Types"]
415-
d['Number of Cell Subclasses'] = taxonomy_metadata["Cell Subclasses"]
416-
d['Number of Cell Classes'] = taxonomy_metadata["Cell Classes"]
417-
d['Species Label'] = taxonomy_metadata["Species"]
418-
d['Age'] = taxonomy_metadata["Age"]
419-
d['Sex'] = taxonomy_metadata["Sex"]
421+
422+
add_taxonomy_info_panel_properties(centralized_data_folder, d, taxon_config)
420423

421424
dl.append(d)
422425
robot_template = pd.DataFrame.from_records(dl)
423426
robot_template.to_csv(output_filepath, sep="\t", index=False)
424427

425428

426-
def generate_datasets_template(dataset_metadata_path, output_filepath):
427-
path_parts = dataset_metadata_path.split(os.path.sep)
428-
taxonomy_id = path_parts[len(path_parts) - 1].split("_")[0]
429-
430-
headers, dataset_metadata = read_csv_to_dict(dataset_metadata_path, generated_ids=True)
431-
432-
robot_template_seed = {'ID': 'ID',
433-
'TYPE': 'TYPE',
434-
'Entity Type': 'TI %',
435-
'Label': 'LABEL',
436-
'Taxonomy': 'AI schema:includedInDataCatalog',
437-
'Cell Count': "AT 'cell_count'^^xsd:integer",
438-
'Nuclei Count': "AT 'nuclei_count'^^xsd:integer",
439-
'Description': "A rdfs:comment",
440-
'Download Link': "A schema:archivedAt",
441-
'Explore Link': "A schema:discussionUrl"
442-
}
443-
dl = [robot_template_seed]
444-
445-
dataset_index = 0
446-
for dataset in dataset_metadata:
447-
d = dict()
448-
d['ID'] = 'PCL:' + get_dataset_id(taxonomy_id, dataset_index)
449-
d['TYPE'] = 'owl:NamedIndividual'
450-
d['Entity Type'] = 'schema:Dataset' # Taxonomy
451-
d['Label'] = dataset_metadata[dataset]['Dataset']
452-
d['Taxonomy'] = 'PCL:' + get_taxonomy_id(taxonomy_id)
453-
cells_nuclei = dataset_metadata[dataset]['cells/nuclei']
454-
if 'nuclei' in cells_nuclei:
455-
d['Nuclei Count'] = int(''.join(c for c in cells_nuclei if c.isdigit()))
456-
elif 'cells' in cells_nuclei:
457-
d['Cell Count'] = int(''.join(c for c in cells_nuclei if c.isdigit()))
458-
d['Description'] = dataset_metadata[dataset]['text']
459-
d['Download Link'] = dataset_metadata[dataset]['download_link']
460-
d['Explore Link'] = dataset_metadata[dataset]['explore_link']
461-
462-
dataset_index += 1
463-
dl.append(d)
464-
robot_template = pd.DataFrame.from_records(dl)
465-
robot_template.to_csv(output_filepath, sep="\t", index=False)
429+
def add_taxonomy_info_panel_properties(centralized_data_folder, d, taxon_config):
    """
    Adds the properties read from the taxonomy's info panel csv file to the given template row.

    Args:
        centralized_data_folder: root folder of the centralized data files (first part of TAXONOMY_INFO_CSV path)
        d: template row (dict) of the taxonomy, updated in place
        taxon_config: taxonomy configuration

    Raises:
        ValueError: if the info panel csv file doesn't exist at the expected location
    """
    taxonomy_id = taxon_config["Taxonomy_id"]
    expected_folder_name = get_centralized_taxonomy_folder(taxon_config)
    taxonomy_metadata_path = TAXONOMY_INFO_CSV.format(centralized_data_folder, expected_folder_name, taxonomy_id)

    if not os.path.isfile(taxonomy_metadata_path):
        # this function reads the info panel file, not the landing page dataset info file
        raise ValueError("Couldn't find taxonomy '{}' info panel file at: '{}'"
                         .format(taxonomy_id, taxonomy_metadata_path))

    # headers are not needed here, only the row of this taxonomy
    _, taxonomies_metadata = read_csv_to_dict(taxonomy_metadata_path)
    taxonomy_metadata = taxonomies_metadata[taxonomy_id]

    # (template column, info panel csv column)
    column_mapping = (
        ('Number of Cell Types', "Cell Types"),
        ('Number of Cell Subclasses', "Cell Subclasses"),
        ('Number of Cell Classes', "Cell Classes"),
        ('Species Label', "Species"),
        ('Age', "Age"),
        ('Sex', "Sex"),
        ('Title', "header"),
        ('Description', "mainDescription"),
        ('Attribution', "attribution"),
        ('SubDescription', "subDescription"),
        ('Anatomy', "Anatomy"),
    )
    for template_column, csv_column in column_mapping:
        d[template_column] = taxonomy_metadata[csv_column]
450+
451+
452+
def generate_datasets_template(centralized_data_folder, output_filepath):
    """
    Generates the dataset individuals template of a taxonomy from its landing page dataset info csv file.
    Taxonomy id is the part of the output file name that precedes the first '_'.

    Args:
        centralized_data_folder: root folder of the centralized data files (first part of DATASET_INFO_CSV path)
        output_filepath: Output file path

    Raises:
        ValueError: if the dataset info csv file doesn't exist at the expected location
    """
    output_file_name = str(output_filepath.split(os.path.sep)[-1])
    taxonomy_id = output_file_name.split("_")[0]
    taxonomy_config = read_taxonomy_config(taxonomy_id)

    taxonomy_folder_name = get_centralized_taxonomy_folder(taxonomy_config)
    expected_file_name = DATASET_INFO_CSV.format(centralized_data_folder, taxonomy_folder_name, taxonomy_id)

    if not os.path.isfile(expected_file_name):
        raise ValueError("Couldn't find taxonomy '{}' landingpage dataset info file at: '{}'"
                         .format(taxonomy_id, expected_file_name))

    headers, dataset_metadata = read_csv_to_dict(expected_file_name, generated_ids=True)

    # first record of the template maps column names to their robot template strings
    template_header = {'ID': 'ID',
                       'TYPE': 'TYPE',
                       'Entity Type': 'TI %',
                       'Label': 'LABEL',
                       'PrefLabel': 'A skos:prefLabel',
                       'Symbol': 'A IAO:0000028',
                       'Taxonomy': 'AI schema:includedInDataCatalog',
                       'Cell Count': "AT 'cell_count'^^xsd:integer",
                       'Nuclei Count': "AT 'nuclei_count'^^xsd:integer",
                       'Description': "A rdfs:comment",
                       'Download Link': "A schema:archivedAt",
                       'Explore Link': "A schema:discussionUrl"
                       }
    records = [template_header]

    for dataset_index, dataset_key in enumerate(dataset_metadata):
        dataset_row = dataset_metadata[dataset_key]

        record = dict()
        record['ID'] = 'PCL:' + get_dataset_id(taxonomy_id, dataset_index)
        record['TYPE'] = 'owl:NamedIndividual'
        record['Entity Type'] = 'schema:Dataset'  # Taxonomy
        record['Label'] = dataset_row['Ontology Name']
        record['PrefLabel'] = dataset_row['Dataset']
        record['Symbol'] = dataset_row['Ontology Symbol']
        record['Taxonomy'] = 'PCL:' + get_taxonomy_id(taxonomy_id)

        # value holds a count together with its unit, 'nuclei' takes precedence over 'cells'
        cells_nuclei = dataset_row['cells/nuclei']
        count_column = None
        if 'nuclei' in cells_nuclei:
            count_column = 'Nuclei Count'
        elif 'cells' in cells_nuclei:
            count_column = 'Cell Count'
        if count_column is not None:
            record[count_column] = int(''.join(ch for ch in cells_nuclei if ch.isdigit()))

        record['Description'] = dataset_row['text']
        record['Download Link'] = dataset_row['download_link']
        record['Explore Link'] = dataset_row['explore_link']

        records.append(record)

    robot_template = pd.DataFrame.from_records(records)
    robot_template.to_csv(output_filepath, sep="\t", index=False)
466504

467505

468506
def generate_marker_gene_set_template(taxonomy_file_path, output_filepath):
@@ -563,3 +601,15 @@ def merge_class_templates(base_tsv, curation_tsv, output_filepath):
563601
output_filepath: Output file path
564602
"""
565603
merge_tables(base_tsv, curation_tsv, output_filepath)
604+
605+
606+
def get_centralized_taxonomy_folder(taxonomy_config):
    """
    Builds the expected name of the taxonomy's folder in the centralized data.
    Name is composed as: lower(Species_abbv) + Brain_region_abbv + "_" + Taxonomy_id
    Args:
        taxonomy_config: taxonomy configuration

    Returns: expected centralized data folder name for the given taxonomy
    """
    species = str(taxonomy_config['Species_abbv'][0]).lower()
    brain_region = taxonomy_config['Brain_region_abbv'][0]
    taxonomy_id = taxonomy_config["Taxonomy_id"]
    return species + brain_region + "_" + taxonomy_id

src/scripts/template_generation_utils.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -448,20 +448,18 @@ def migrate_manual_curations(source_tsv, target_tsv, migrate_columns, output_fil
448448
writer.writerow(row)
449449

450450

451-
def read_allen_descriptions(path, species):
451+
def read_allen_descriptions(path):
    """
    Reads Allen descriptions file from the given location.
    Args:
        path: Path to the 'All Descriptions' json file
    Returns: parsed Allen descriptions json data
    Raises:
        ValueError: if there is no file at the given path
    """
    if not os.path.isfile(path):
        # the exception must be raised; only constructing it would let the function continue
        # and fail later with an unrelated UnboundLocalError
        raise ValueError("Couldn't find allen descriptions file at: '{}'".format(path))

    with open(path, 'r') as f:
        return json.load(f)
466464

467465

src/scripts/template_runner.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
' files. Without optional args, generates an ind '
1616
'template.')
1717
parser_generator.add_argument('-i', '--input', help="Path to input JSON file")
18+
parser_generator.add_argument('-i2', '--input2', help="Path to second input file")
1819
parser_generator.add_argument('-o', '--output', help="Path to output TSV file")
1920
parser_generator.add_argument('-b', '--base', help="List of all class base TSV files")
2021
parser_generator.add_argument('-cb', action='store_true', help="Generate a class base template.")
@@ -59,4 +60,4 @@
5960
elif args.ms:
6061
generate_marker_gene_set_template(args.input, args.output)
6162
else:
62-
generate_ind_template(args.input, args.output)
63+
generate_ind_template(args.input, args.input2, args.output)

0 commit comments

Comments
 (0)