forked from jarny/iscandar
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcreate_data_model.py
More file actions
executable file
·105 lines (85 loc) · 4.92 KB
/
create_data_model.py
File metadata and controls
executable file
·105 lines (85 loc) · 4.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
"""
Script to create output/js/data-model.js file, using data-model_template.js file as mako template.
Usage (if all input files are ready to go in input/ directory):
> python create_data_model.py
To use it from within another python script, setting the input variables directly:
> from create_data_model import DataModel
> dm = DataModel()
> dm.metadata = {'name':'pera', ...}
> dm.saveJSFile()
"""
import os, pandas
from mako.template import Template
import time
class DataModel(object):
    """Holder for the values injected into the data-model.js mako template.

    The class defines no attributes itself; every attribute read by saveJSFile()
    must be attached to the instance beforehand. See input directory and
    dataModelFromInputFiles() function below for examples of data required to be
    attached to DataModel object.
    """

    def saveJSFile(self, templateFile="data-model_template.js", outfile="output/js/data-model.js"):
        """Render templateFile with this object's data and write the result to outfile.

        Args:
            templateFile: path of the mako template to render.
            outfile: path of the file to (over)write with the rendered text.

        Raises:
            AttributeError: if any attribute listed in params below has not been
                assigned to this instance.
        """
        # Variables injected into the template. pca and tsne are transposed and
        # converted to nested lists; everything else is passed through as is.
        params = {
            'metadata': self.metadata,
            'pca': self.pca.T.values.tolist(),
            'tsne': self.tsne.T.values.tolist(),
            'analysisMetadata': self.analysisMetadata,
            'sampleIds': self.sampleIds,
            'sampleGroups': self.sampleGroups,
            'sampleGroupItems': self.sampleGroupItems,
            'sampleGroupColours': self.sampleGroupColours,
            'sampleIdsAsGroupItems': self.sampleIdsAsGroupItems,
            'clusters': self.clusters,
            'clusterItems': self.clusterItems,
            'clusterColours': self.clusterColours,
            'sampleIdsAsClusterItems': self.sampleIdsAsClusterItems,
            'genesets': self.genesets,
            'genesetExpressionValues': self.genesetExpressionValues,
            'geneExpressionValues': self.geneExpressionValues,
        }
        template = Template(filename=templateFile)
        # Render before opening the output file, so that a rendering failure
        # does not leave a truncated (empty) outfile behind.
        rendered = template.render(**params)
        # The context manager guarantees the handle is flushed and closed.
        with open(outfile, 'w') as handle:
            handle.write(rendered)
def _readTable(inputDir, filename, **readCsvOptions):
    """Read a tab-separated file from inputDir, using its first column as the index."""
    return pandas.read_csv(os.path.join(inputDir, filename), sep="\t", index_col=0, **readCsvOptions)


def _readKeyValueFile(inputDir, filename):
    """Read a header-less tab-separated file as a dict mapping first column to second column."""
    return _readTable(inputDir, filename, header=None).to_dict()[1]


def _parseItemTable(table):
    """Return (names, items, colours) parsed from a sampleGroupItems/clusterItems style table.

    The first data column of each row is split on commas into item names and the
    second data column into the matching colours.
    """
    names = [str(name) for name in table.index.tolist()]
    items = {}
    colours = {}
    for name, row in table.iterrows():
        # Use .iloc for positional access: the row is labelled by the file's
        # column headers, and integer keys through row[0] are deprecated in pandas.
        itemNames = row.iloc[0].split(',')
        items[str(name)] = [str(item) for item in itemNames]
        colours[str(name)] = dict(zip(itemNames, row.iloc[1].split(',')))
    return names, items, colours


def _valuesPerSample(table, sampleIds, columns):
    """Return {column: [str(value) for each sample id, in sampleIds order]} for the given columns."""
    return {column: [str(table.at[sampleId, column]) for sampleId in sampleIds] for column in columns}


def dataModelFromInputFiles(inputDir="input"):
    """Instantiate a DataModel instance and assign all required attributes by reading the files in input directory.

    Files read from inputDir (all tab-separated): metadata.txt, pca.txt, tsne.txt,
    analysisMetadata.txt, samples.txt, sampleGroupItems.txt, clusters.txt,
    clusterItems.txt, genesets.txt and expression.txt.

    Args:
        inputDir: directory containing the input files.

    Returns:
        The populated DataModel instance.
    """
    dm = DataModel()
    dm.metadata = _readKeyValueFile(inputDir, "metadata.txt")
    dm.metadata['report creation date'] = time.strftime('%Y-%m-%d', time.localtime())
    dm.pca = _readTable(inputDir, "pca.txt")
    dm.tsne = _readTable(inputDir, "tsne.txt")
    dm.analysisMetadata = _readKeyValueFile(inputDir, "analysisMetadata.txt")

    # process sample info; the row order of pca.txt defines the sample order used everywhere
    dm.sampleIds = [str(item) for item in dm.pca.index.tolist()]
    samples = _readTable(inputDir, "samples.txt")
    samples.index = [str(item) for item in samples.index]
    samples = samples.loc[dm.sampleIds]
    sampleGroupItems = _readTable(inputDir, "sampleGroupItems.txt")
    dm.sampleGroups, dm.sampleGroupItems, dm.sampleGroupColours = _parseItemTable(sampleGroupItems)
    dm.sampleIdsAsGroupItems = _valuesPerSample(samples, dm.sampleIds, dm.sampleGroups)

    # process cluster info
    clusters = _readTable(inputDir, "clusters.txt")
    clusters.index = [str(item) for item in clusters.index]
    # A clusters file without rows is kept as is; presumably this lets reports
    # without clusters skip the lookup of sample ids - TODO confirm.
    if len(clusters) > 0:
        clusters = clusters.loc[dm.sampleIds]
    clusterItems = _readTable(inputDir, "clusterItems.txt")
    dm.clusters, dm.clusterItems, dm.clusterColours = _parseItemTable(clusterItems)
    dm.sampleIdsAsClusterItems = _valuesPerSample(clusters, dm.sampleIds, dm.clusters)

    # process genesets; 'genes' and 'meanExpression' columns hold comma-separated values
    genesets = _readTable(inputDir, "genesets.txt")
    dm.genesets = [{'name': index, 'genes': row['genes'].split(',')} for index, row in genesets.iterrows()]
    dm.genesetExpressionValues = {
        index: list(map(float, row['meanExpression'].split(',')))
        for index, row in genesets.iterrows()
    }

    # gene expression values, one list per row of expression.txt
    expression = _readTable(inputDir, "expression.txt")
    dm.geneExpressionValues = {index: row.tolist() for index, row in expression.iterrows()}
    return dm
# Script entry point: build the model from the default input directory and
# write the default output file.
if __name__ == "__main__":
    model = dataModelFromInputFiles()
    model.saveJSFile()
def test_inputFiles():
    """Sanity checks on the DataModel built from the default input directory."""
    model = dataModelFromInputFiles()
    assert 'name' in model.metadata
    # pca and tsne tables must have identical dimensions
    assert model.tsne.shape == model.pca.shape
    assert len(model.sampleGroups) >= 1
    # item dictionaries must be keyed by exactly the declared group/cluster names
    assert set(model.sampleGroups) == set(model.sampleGroupItems)
    assert set(model.clusters) == set(model.clusterItems)
    assert all(key in model.genesets[0] for key in ('name', 'genes'))