Skip to content

Commit 6b5db7d

Browse files
committed
Merge branch 'sapiris-master'
2 parents a08fd16 + 289f912 commit 6b5db7d

File tree

9 files changed

+236
-32
lines changed

9 files changed

+236
-32
lines changed

MANIFEST.in

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,7 @@ recursive-include grim *.txt
88
recursive-include grim *.json
99
recursive-include grim *.pyx
1010
recursive-include grim *.pyd
11+
recursive-include conf *.json
12+
recursive-include data *.csv
13+
recursive-include graph_generation *.csv
14+
recursive-include graph_generation *.py
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
{
2+
"populations": [
3+
"CAU"
4+
],
5+
"freq_trim_threshold": 1e-5,
6+
"priority": {
7+
"alpha": 0.4999999,
8+
"eta": 0,
9+
"beta": 1e-7,
10+
"gamma": 1e-7,
11+
"delta": 0.4999999
12+
},
13+
"UNK_priors": "SR",
14+
"FULL_LOCI": "ABCQR",
15+
"loci_map": {
16+
"A": 1,
17+
"B": 2,
18+
"C": 3,
19+
"DQB1": 4,
20+
"DRB1": 5
21+
},
22+
23+
"factor_missing_data": 0.0001,
24+
"Plan_B_Matrix": [
25+
[[1, 2, 3, 4, 5]],
26+
[[1, 2, 3], [4, 5]],
27+
[[1], [2, 3], [4, 5]],
28+
[[1, 2, 3], [4], [5]],
29+
[[1], [2, 3], [4], [5]],
30+
[[1], [2], [3], [4], [5]]
31+
],
32+
"planb": true,
33+
"number_of_options_threshold": 100000,
34+
"epsilon": 1e-3,
35+
"number_of_results": 10,
36+
"number_of_pop_results": 100,
37+
"output_MUUG": true,
38+
"output_haplotypes": true,
39+
"freq_data_dir": "data/freqs" ,
40+
"freq_file": "output/hpf.csv" ,
41+
"graph_files_path": "graph_generation/output/csv/" ,
42+
"node_csv_file": "nodes.csv",
43+
"edges_csv_file": "edges.csv",
44+
"info_node_csv_file": "info_node.csv",
45+
"top_links_csv_file": "top_links.csv",
46+
"imputation_in_file": "data/subjects/donor.csv",
47+
"imputation_out_umug_freq_filename": "don.umug",
48+
"imputation_out_umug_pops_filename": "don.umug.pops",
49+
"imputation_out_hap_freq_filename": "don.pmug",
50+
"imputation_out_hap_pops_filename": "don.pmug.pops",
51+
"imputation_out_miss_filename": "don.miss",
52+
"imputation_out_problem_filename": "don.problem",
53+
"max_haplotypes_number_in_phase": 100,
54+
"imuptation_out_path": "output"
55+
}

conf/minimal-configuration.json

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,8 @@
3737
"output_MUUG": true,
3838
"output_haplotypes": true,
3939
"freq_data_dir": "data/freqs" ,
40-
"pops_count_file": "graph_generation/output/pop_ratio.txt" ,
41-
"freq_file": "graph_generation/output/hpf.csv" ,
42-
"graph_files_path": "graph_generation/output/csv/" ,
40+
"freq_file": "output/hpf.csv" ,
41+
"graph_files_path": "output/csv/" ,
4342
"node_csv_file": "nodes.csv",
4443
"edges_csv_file": "edges.csv",
4544
"info_node_csv_file": "info_node.csv",

graph_generation/generate_neo4j_multi_hpf.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -205,7 +205,7 @@ def loci_order(loc_values):
205205

206206

207207
def generate_graph(
208-
config_file="../../conf/minimal-configuration.json", em_pop=None, em=False
208+
config_file="../conf/minimal-configuration-script.json", em_pop=None, em=False, use_default_path = False
209209
):
210210
##############################################################################
211211
# Configure
@@ -215,7 +215,9 @@ def generate_graph(
215215

216216
# Input file
217217
# freq_file = path + freq_file
218-
218+
path = ""
219+
if use_default_path:
220+
path = os.path.dirname(os.path.realpath(__file__)) + "/"
219221
parser = argparse.ArgumentParser()
220222
parser.add_argument(
221223
"-c",
@@ -243,10 +245,10 @@ def generate_graph(
243245
pops = em_pop
244246
freq_trim = conf.get("freq_trim_threshold")
245247

246-
freq_file = conf.get("freq_file")
248+
freq_file = path + conf.get("freq_file")
247249
dict_count_of_pop = {}
248250

249-
pop_ratio_dir = conf.get("pops_count_file")
251+
pop_ratio_dir = path + conf.get("pops_count_file", "")
250252
path = pathlib.Path(pop_ratio_dir)
251253

252254
if em or not path.is_file():

grim/grim.py

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,39 +23,53 @@
2323
#
2424

2525

26-
from .validation import runfile
27-
from .imputation.graph_generation import generate_neo4j_multi_hpf
26+
27+
2828

2929
from .imputation.impute import Imputation
3030
from .imputation.networkx_graph import Graph
31+
32+
import sys
3133
import os
3234

35+
# Add the directory that contains the grim package (the project root) to the system path
36+
sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)).replace("/grim", ""))
37+
38+
39+
from graph_generation import generate_neo4j_multi_hpf
40+
from grim.run_impute_def import run_impute
41+
42+
3343

3444
def graph_freqs(conf_file="", for_em=False, em_pop=None):
    """Generate the frequency graph files via ``generate_graph``.

    Args:
        conf_file: Path to a JSON configuration file. When left as the empty
            string, the ``conf/minimal-configuration.json`` file that sits
            next to the ``grim`` package directory is used, and
            ``generate_graph`` is told to resolve its paths relative to its
            own location (``use_default_path=True``).
        for_em: Forwarded to ``generate_graph`` as ``em``.
        em_pop: Forwarded to ``generate_graph`` as ``em_pop``.
    """
    use_default_path = conf_file == ""
    if use_default_path:
        # Take the parent of the package directory instead of deleting the
        # text "/grim" from the path: str.replace would also strip unrelated
        # occurrences (e.g. "/home/grimes/..." or a checkout folder that is
        # itself named "grim") and never matches Windows separators.
        package_dir = os.path.dirname(os.path.realpath(__file__))
        project_root = os.path.dirname(package_dir)
        conf_file = os.path.join(
            project_root, "conf", "minimal-configuration.json"
        )

    generate_neo4j_multi_hpf.generate_graph(
        config_file=conf_file,
        em_pop=em_pop,
        em=for_em,
        use_default_path=use_default_path,
    )
4456

4557

4658
def impute(conf_file=""):
    """Run imputation through ``run_impute`` for the given configuration.

    Args:
        conf_file: Path to a JSON configuration file. When left as the empty
            string, the bundled ``conf/minimal-configuration.json`` is used
            and the graph/input path prefixes are pointed at the directory
            that contains the ``grim`` package. When a file is supplied, both
            prefixes stay empty so the paths in that file are used as written.
    """
    project_dir_in_file, project_dir_graph = "", ""
    if conf_file == "":
        # Take the parent of the package directory instead of deleting the
        # text "/grim" from the path: str.replace would also strip unrelated
        # occurrences (e.g. "/home/grimes/..." or a checkout folder that is
        # itself named "grim") and never matches Windows separators.
        package_dir = os.path.dirname(os.path.realpath(__file__))
        project_root = os.path.dirname(package_dir)

        conf_file = os.path.join(
            project_root, "conf", "minimal-configuration.json"
        )
        # The trailing "" makes os.path.join end the result with a separator,
        # because run_impute builds file paths by plain string concatenation.
        project_dir_graph = os.path.join(project_root, "graph_generation", "")
        project_dir_in_file = os.path.join(project_root, "")
    run_impute(conf_file, project_dir_graph, project_dir_in_file)
5973

6074

6175
def impute_instance(config, graph, count_by_prob=None):

grim/run_impute_def.py

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
import argparse
2+
import cProfile
3+
import json
4+
import pathlib
5+
import sys
6+
import os
7+
8+
sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)))
9+
10+
from .imputation.impute import Imputation
11+
from .imputation.networkx_graph import Graph
12+
13+
# Profiler start
14+
#pr = cProfile.Profile()
15+
#pr.enable()
16+
17+
def run_impute(
    conf_file="../conf/minimal-configuration.json",
    project_dir_graph="",
    project_dir_in_file="",
):
    """Run imputation as described by a JSON configuration file.

    The configuration is loaded, turned into the ``config`` dictionary that
    ``Graph`` and ``Imputation`` receive, echoed to stdout, and then used to
    build the graph and impute the input file.

    Args:
        conf_file: Path to the JSON configuration file.
        project_dir_graph: Prefix prepended (by string concatenation) to the
            graph CSV paths and to ``pops_count_file``.
        project_dir_in_file: Prefix prepended (by string concatenation) to
            the imputation input file paths.

    Raises:
        OSError: If ``conf_file`` cannot be opened.
        TypeError: If a file-name key that has no default (for example
            ``node_csv_file``) is missing, since ``None`` cannot be
            concatenated to a path prefix.
    """

    def _as_dir_prefix(path):
        # Paths below are built with "+", so a non-empty directory needs a
        # trailing slash. A missing or empty value becomes "" instead of
        # failing on ``path[-1]``.
        if not path:
            return ""
        return path if path.endswith("/") else path + "/"

    # Read configuration file and load properties
    with open(conf_file) as f:
        json_conf = json.load(f)

    graph_files_path = _as_dir_prefix(json_conf.get("graph_files_path"))
    # NOTE: "imuptation_out_path" is misspelled, but it is the key the
    # configuration files actually use, so it must stay as is.
    output_dir = _as_dir_prefix(json_conf.get("imuptation_out_path", "output"))
    graph_dir = project_dir_graph + graph_files_path

    config = {
        "planb": json_conf.get("planb", True),
        "pops": json_conf.get("populations"),
        "priority": json_conf.get("priority"),
        "epsilon": json_conf.get("epsilon", 1e-3),
        "number_of_results": json_conf.get("number_of_results", 1000),
        "number_of_pop_results": json_conf.get("number_of_pop_results", 100),
        "output_MUUG": json_conf.get("output_MUUG", True),
        "output_haplotypes": json_conf.get("output_haplotypes", False),
        "node_file": graph_dir + json_conf.get("node_csv_file"),
        "top_links_file": graph_dir + json_conf.get("top_links_csv_file"),
        "edges_file": graph_dir + json_conf.get("edges_csv_file"),
        "imputation_input_file": project_dir_in_file
        + json_conf.get("imputation_in_file"),
        "imputation_out_umug_freq_file": output_dir
        + json_conf.get("imputation_out_umug_freq_filename"),
        "imputation_out_umug_pops_file": output_dir
        + json_conf.get("imputation_out_umug_pops_filename"),
        "imputation_out_hap_freq_file": output_dir
        + json_conf.get("imputation_out_hap_freq_filename"),
        "imputation_out_hap_pops_file": output_dir
        + json_conf.get("imputation_out_hap_pops_filename"),
        "imputation_out_miss_file": output_dir
        + json_conf.get("imputation_out_miss_filename"),
        "imputation_out_problem_file": output_dir
        + json_conf.get("imputation_out_problem_filename"),
        "factor_missing_data": json_conf.get("factor_missing_data", 0.01),
        "loci_map": json_conf.get(
            "loci_map", {"A": 1, "B": 3, "C": 2, "DQB1": 4, "DRB1": 5}
        ),
        "matrix_planb": json_conf.get(
            "Plan_B_Matrix",
            [
                [[1, 2, 3, 4, 5]],
                [[1, 2, 3], [4, 5]],
                [[1], [2, 3], [4, 5]],
                [[1, 2, 3], [4], [5]],
                [[1], [2, 3], [4], [5]],
                [[1], [2], [3], [4], [5]],
            ],
        ),
        "pops_count_file": project_dir_graph + json_conf.get("pops_count_file", ""),
        # Holds the configured path itself (truthy) or False when the key is absent.
        "use_pops_count_file": json_conf.get("pops_count_file", False),
        "number_of_options_threshold": json_conf.get(
            "number_of_options_threshold", 100000
        ),
        "max_haplotypes_number_in_phase": json_conf.get(
            "max_haplotypes_number_in_phase", 100
        ),
        "bin_imputation_input_file": project_dir_in_file
        + json_conf.get("bin_imputation_in_file", "None"),
        "nodes_for_plan_A": json_conf.get("Plan_A_Matrix", []),
        "save_mode": json_conf.get("save_space_mode", False),
        "UNK_priors": json_conf.get("UNK_priors", "MR"),
    }

    # Display the configurations we are using
    print("****************************************************************************************************")
    print("Performing imputation based on:")
    print("\tPopulation: {}".format(config["pops"]))
    print("\tPriority: {}".format(config["priority"]))
    print("\tUNK priority: {}".format(config["UNK_priors"]))
    print("\tEpsilon: {}".format(config["epsilon"]))
    print("\tPlan B: {}".format(config["planb"]))
    print("\tNumber of Results: {}".format(config["number_of_results"]))
    print("\tNumber of Population Results: {}".format(config["number_of_pop_results"]))
    print("\tNodes File: {}".format(config["node_file"]))
    # This line used to print the edges file under the "Top Links File" label.
    print("\tTop Links File: {}".format(config["top_links_file"]))
    print("\tEdges File: {}".format(config["edges_file"]))
    print("\tInput File: {}".format(config["imputation_input_file"]))
    print("\tOutput UMUG Format: {}".format(config["output_MUUG"]))
    print("\tOutput UMUG Freq Filename: {}".format(config["imputation_out_umug_freq_file"]))
    print("\tOutput UMUG Pops Filename: {}".format(config["imputation_out_umug_pops_file"]))
    print("\tOutput Haplotype Format: {}".format(config["output_haplotypes"]))
    print("\tOutput HAP Freq Filename: {}".format(config["imputation_out_hap_freq_file"]))
    print("\tOutput HAP Pops Filename: {}".format(config["imputation_out_hap_pops_file"]))
    print("\tOutput Miss Filename: {}".format(config["imputation_out_miss_file"]))
    print("\tOutput Problem Filename: {}".format(config["imputation_out_problem_file"]))
    print("\tFactor Missing Data: {}".format(config["factor_missing_data"]))
    print("\tLoci Map: {}".format(config["loci_map"]))
    print("\tPlan B Matrix: {}".format(config["matrix_planb"]))
    print("\tPops Count File: {}".format(config["pops_count_file"]))
    print("\tUse Pops Count File: {}".format(config["use_pops_count_file"]))
    print("\tNumber of Options Threshold: {}".format(config["number_of_options_threshold"]))
    print("\tMax Number of haplotypes in phase: {}".format(config["max_haplotypes_number_in_phase"]))
    if config["nodes_for_plan_A"]:
        print("\tNodes in plan A: {}".format(config["nodes_for_plan_A"]))
    print("\tSave space mode: {}".format(config["save_mode"]))
    print("****************************************************************************************************")

    # "full_loci" is the sorted concatenation of the distinct locus indices.
    all_loci_set = {str(val) for val in config["loci_map"].values()}
    config["full_loci"] = "".join(sorted(all_loci_set))

    # Perform imputation
    graph = Graph(config)
    graph.build_graph(config["node_file"], config["top_links_file"], config["edges_file"])
    imputation = Imputation(graph, config)

    # Create output directory if it doesn't exist; parents=True so that a
    # nested output path does not fail.
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Write out the results from imputation
    imputation.impute_file(config)
124+
125+
# Profiler end
126+
#pr.disable()
127+
#pr.print_stats(sort="time")

scripts/parallel-imputation.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,10 @@
88
import pathlib
99
from multiprocessing.pool import Pool
1010

11-
from imputegl import Imputation
11+
from grim.imputation.impute import Imputation
1212
from imputegl.impute import write_best_prob, write_best_prob_genotype
1313

14-
from imputegl.networkx_graph import Graph
14+
from grim.imputation.networkx_graph import Graph
1515

1616
# Profiler start
1717
pr = cProfile.Profile()
@@ -22,7 +22,7 @@
2222
"-c",
2323
"--config",
2424
required=False,
25-
default="../minimal-configuration.json",
25+
default="../minimal-configuration-script.json",
2626
help="Configuration JSON file",
2727
type=str,
2828
)
@@ -56,9 +56,9 @@
5656
"number_of_pop_results": json_conf.get("number_of_pop_results", 100),
5757
"output_MUUG": json_conf.get("output_MUUG", True),
5858
"output_haplotypes": json_conf.get("output_haplotypes", False),
59-
"node_file": project_dir + json_conf.get("node_csv_file"),
60-
"top_links_file": project_dir + json_conf.get("top_links_csv_file"),
61-
"edges_file": project_dir + json_conf.get("edges_csv_file"),
59+
"node_file": project_dir + json_conf.get("graph_files_path") + json_conf.get("node_csv_file"),
60+
"top_links_file": project_dir + json_conf.get("graph_files_path") + json_conf.get("top_links_csv_file"),
61+
"edges_file": project_dir + json_conf.get("graph_files_path") + json_conf.get("edges_csv_file"),
6262
"imputation_input_file": project_dir + json_conf.get("imputation_in_file"),
6363
"imputation_out_umug_freq_file": output_dir
6464
+ json_conf.get("imputation_out_umug_freq_filename"),

scripts/runfile.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
import sys
77
import os
88

9+
sys.path.insert(0, os.path.join(".."))
10+
911
from grim.imputation.impute import Imputation
1012
from grim.imputation.networkx_graph import Graph
1113

@@ -18,7 +20,7 @@
1820
"-c",
1921
"--config",
2022
required=False,
21-
default="../conf/minimal-configuration.json",
23+
default="../conf/minimal-configuration-script.json",
2224
help="Configuration JSON file",
2325
type=str,
2426
)
@@ -28,6 +30,7 @@
2830

2931
# read the config file
3032
output_dir = "output/"
33+
project_dir = "../"
3134

3235
# Read configuration file and load properties
3336
with open(configuration_file) as f:
@@ -42,11 +45,10 @@
4245
"number_of_pop_results": json_conf.get("number_of_pop_results", 100),
4346
"output_MUUG": json_conf.get("output_MUUG", True),
4447
"output_haplotypes": json_conf.get("output_haplotypes", False),
45-
"node_file": json_conf.get("graph_files_path") + json_conf.get("node_csv_file"),
46-
"top_links_file": json_conf.get("graph_files_path")
47-
+ json_conf.get("top_links_csv_file"),
48-
"edges_file": json_conf.get("graph_files_path") + json_conf.get("edges_csv_file"),
49-
"imputation_input_file": json_conf.get("imputation_in_file"),
48+
"node_file": project_dir + json_conf.get("graph_files_path") + json_conf.get("node_csv_file"),
49+
"top_links_file": project_dir + json_conf.get("graph_files_path") + json_conf.get("top_links_csv_file"),
50+
"edges_file": project_dir + json_conf.get("graph_files_path") + json_conf.get("edges_csv_file"),
51+
"imputation_input_file": project_dir + json_conf.get("imputation_in_file"),
5052
"imputation_out_umug_freq_file": output_dir
5153
+ json_conf.get("imputation_out_umug_freq_filename"),
5254
"imputation_out_umug_pops_file": output_dir

setup.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -77,10 +77,11 @@
7777
include=[
7878
"grim",
7979
"grim.imputation",
80-
"grim.imputation.imputegl",
81-
"grim.imputation.graph_generation",
82-
"grim.validation",
83-
"grim.conf",
80+
"graph_generation",
81+
"graph_generation.output",
82+
"data",
83+
"data.subjects",
84+
"conf"
8485
]
8586
),
8687
test_suite="tests",

0 commit comments

Comments
 (0)