Skip to content

Commit 290272f

Browse files
authored
Merge pull request #30 from pbashyal-nmdp/make_graph_module
Update README and graph generation
2 parents 7673c13 + 625e921 commit 290272f

File tree

5 files changed

+226
-87
lines changed

5 files changed

+226
-87
lines changed

README.md

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,140 @@
1111

1212
![GRIM Dependencies](images/py-graph-imputation.png)
1313

14+
### Use `py-graph-imputation`
15+
16+
#### Install `py-graph-imputation` from PyPi
17+
```
18+
pip install py-graph-imputation
19+
```
20+
21+
#### Get Frequency Data and Subject Data and Configuration File
22+
23+
For an example, copy the folders to your working directory:
24+
- https://github.com/nmdp-bioinformatics/py-graph-imputation/tree/master/data
25+
- https://github.com/nmdp-bioinformatics/py-graph-imputation/tree/master/conf
26+
27+
so it appears as:
28+
29+
```
30+
.
31+
├── conf
32+
│   ├── README.md
33+
│   └── minimal-configuration.json
34+
├── data
35+
│   ├── freqs
36+
│   │   └── CAU.freqs.gz
37+
│   └── subjects
38+
│   └── donor.csv
39+
```
40+
41+
#### Modify the configuration.json to suit your need
42+
43+
44+
#### Produce HPF csv file from Frequency Data
45+
46+
```
47+
>>> from graph_generation.generate_hpf import produce_hpf
48+
>>> produce_hpf(conf_file='conf/minimal-configuration.json')
49+
****************************************************************************************************
50+
Conversion to HPF file based on following configuration:
51+
Population: ['CAU']
52+
Frequency File Directory: data/freqs
53+
Output File: output/hpf.csv
54+
****************************************************************************************************
55+
Reading Frequency File: data/freqs/CAU.freqs.gz
56+
Writing hpf File: output/hpf.csv
57+
```
58+
59+
This will produce the files which will be used for graph generation:
60+
61+
```
62+
├── output
63+
│   ├── hpf.csv # CSV file of Haplotype, Population, Freq
64+
│   └── pop_counts_file.txt # Size of each population
65+
```
66+
67+
#### Generate the Graph (nodes and edges) files
68+
69+
```
70+
>>> from grim.grim import graph_freqs
71+
72+
>>> graph_freqs(conf_file='conf/minimal-configuration.json')
73+
****************************************************************************************************
74+
Performing graph generation based on following configuration:
75+
Population: ['CAU']
76+
Freq File: output/hpf.csv
77+
Freq Trim Threshold: 1e-05
78+
****************************************************************************************************
79+
```
80+
81+
This will produce the following files:
82+
83+
```
84+
├── output
85+
│   ├── csv
86+
│   │   ├── edges.csv
87+
│   │   ├── info_node.csv
88+
│   │   ├── nodes.csv
89+
│   │   └── top_links.csv
90+
91+
```
92+
93+
#### Produce Imputation Results for Subjects
94+
95+
```
96+
>>> from grim.grim import graph_freqs, impute
97+
>>> impute(conf_file='conf/minimal-configuration.json')
98+
****************************************************************************************************
99+
Performing imputation based on:
100+
Population: ['CAU']
101+
Priority: {'alpha': 0.4999999, 'eta': 0, 'beta': 1e-07, 'gamma': 1e-07, 'delta': 0.4999999}
102+
UNK priority: SR
103+
Epsilon: 0.001
104+
Plan B: True
105+
Number of Results: 10
106+
Number of Population Results: 100
107+
Nodes File: output/csv/nodes.csv
108+
Top Links File: output/csv/edges.csv
109+
Input File: data/subjects/donor.csv
110+
Output UMUG Format: True
111+
Output UMUG Freq Filename: output/don.umug
112+
Output UMUG Pops Filename: output/don.umug.pops
113+
Output Haplotype Format: True
114+
Output HAP Freq Filename: output/don.pmug
115+
Output HAP Pops Filename: output/don.pmug.pops
116+
Output Miss Filename: output/don.miss
117+
Output Problem Filename: output/don.problem
118+
Factor Missing Data: 0.0001
119+
Loci Map: {'A': 1, 'B': 2, 'C': 3, 'DQB1': 4, 'DRB1': 5}
120+
Plan B Matrix: [[[1, 2, 3, 4, 5]], [[1, 2, 3], [4, 5]], [[1], [2, 3], [4, 5]], [[1, 2, 3], [4], [5]], [[1], [2, 3], [4], [5]], [[1], [2], [3], [4], [5]]]
121+
Pops Count File: output/pop_counts_file.txt
122+
Use Pops Count File: output/pop_counts_file.txt
123+
Number of Options Threshold: 100000
124+
Max Number of haplotypes in phase: 100
125+
Save space mode: False
126+
****************************************************************************************************
127+
0 Subject: D1 8400 haplotypes
128+
0 Subject: D1 6028 haplotypes
129+
0.09946062499999186
130+
```
131+
132+
This will produce files in `output` directory as:
133+
134+
```
135+
├── output
136+
│   ├── don.miss # Cases that failed imputation (e.g. incorrect typing etc.)
137+
│   ├── don.pmug # Phased imputation as PMUG GL String
138+
│   ├── don.pmug.pops # Population for Phased Imputation
139+
│   ├── don.problem # List of errors
140+
│   ├── don.umug # Unphased imputation as UMUG GL String
141+
│   ├── don.umug.pops # Population for Unphased Imputation
142+
```
143+
144+
145+
146+
147+
14148
### Development
15149
How to develop on the project locally.
16150

graph_generation/generate_hpf.py

Lines changed: 89 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -4,89 +4,94 @@
44
import pathlib
55
import argparse
66

7+
8+
def produce_hpf(conf_file):
    """Convert per-population frequency files into a single HPF csv.

    Reads the JSON configuration at *conf_file*, loads each population's
    ``<pop>.freqs.gz`` file from the configured frequency directory, and
    writes:

    * an HPF (Haplotype, Population, Frequency) csv file, and
    * a population-counts file listing each population's size and its
      ratio of the overall total.

    The ``Haplo`` header row and zero-frequency haplotypes are skipped.

    :param conf_file: path to the JSON configuration file
    """
    # Read configuration file and load properties
    with open(conf_file) as f:
        conf = json.load(f)

    pops = conf.get("populations")
    freq_data_dir = project_dir + conf.get("freq_data_dir")
    output_dir = project_dir + conf.get("graph_files_path")
    pop_ratio_dir = project_dir + conf.get("pops_count_file")
    # Output in HaplotypePopulationFrequency (hpf) csv file
    freq_file = project_dir + conf.get("freq_file")

    # Create output directory if it doesn't exist
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Display the configurations we are using
    print(
        "****************************************************************************************************"
    )
    print("Conversion to HPF file based on following configuration:")
    print("\tPopulation: {}".format(pops))
    print("\tFrequency File Directory: {}".format(freq_data_dir))
    print("\tOutput File: {}".format(freq_file))
    print(
        "****************************************************************************************************"
    )

    haplist_overall = {}  # list of haplotypes across all populations
    pop_hap_combos = {}  # "<pop>-<haplotype>" -> frequency

    list_pop_count = []
    #### Load initial frequency files
    for pop in pops:
        in_freq_file = freq_data_dir + "/" + pop + ".freqs.gz"
        print("Reading Frequency File:\t {}".format(in_freq_file))
        with gzip.open(in_freq_file, "rb") as zf:
            count_pop = 0
            lines = [x.decode("utf8").strip() for x in zf.readlines()]
            for hap_line in lines:
                haplotype, count, freq = hap_line.split(",")
                if haplotype == "Haplo":  # skip the header row
                    continue
                freq = float(freq)
                # Ignore lines with 0 freq
                if freq == 0.0:
                    continue

                pop_haplotype = pop + "-" + haplotype
                haplist_overall[haplotype] = 1
                pop_hap_combos[pop_haplotype] = freq

                count_pop += float(count)
        list_pop_count.append(count_pop)

    # BUGFIX: write the population counts through a context manager so the
    # file is always flushed and closed (the original leaked the handle).
    sum_pops = sum(list_pop_count)
    with open(pop_ratio_dir, "w") as pop_ratio_file:
        for pop, ratio in zip(pops, list_pop_count):
            pop_ratio_file.write("{},{},{}\n".format(pop, ratio, (ratio / sum_pops)))

    header = ["hap", "pop", "freq"]

    print("Writing hpf File:\t {}".format(freq_file))
    with open(freq_file, mode="w") as csv_file:
        csv_writer = csv.writer(csv_file, delimiter=",", quoting=csv.QUOTE_NONE)
        csv_writer.writerow(header)
        for pop_haplotype in pop_hap_combos:
            # BUGFIX: split on the first "-" only, so a haplotype name that
            # itself contains "-" does not raise ValueError.
            (pop, haplotype) = pop_haplotype.split("-", 1)
            freq = pop_hap_combos[pop_haplotype]
            csv_writer.writerow([haplotype, pop, freq])


# Global
project_dir = ""
881

9-
parser = argparse.ArgumentParser()
10-
parser.add_argument(
11-
"-c",
12-
"--config",
13-
required=False,
14-
default="../../conf/minimal-configuration.json",
15-
help="Configuration JSON file",
16-
type=str,
17-
)
18-
19-
args = parser.parse_args()
20-
configuration_file = args.config
21-
22-
# Read configuration file and load properties
23-
with open(configuration_file) as f:
24-
conf = json.load(f)
25-
26-
pops = conf.get("populations")
27-
freq_data_dir = project_dir + conf.get("freq_data_dir")
28-
output_dir = project_dir + conf.get("graph_files_path")
29-
pop_ratio_dir = project_dir + conf.get("pops_count_file")
30-
# Output in HaplotypePopulationFrequency (hpf) csv file
31-
freq_file = project_dir + conf.get("freq_file")
32-
33-
34-
# Create output directory if it doesn't exist
35-
pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
36-
37-
# Display the configurations we are using
38-
print(
39-
"****************************************************************************************************"
40-
)
41-
print("Conversion to HPF file based on following configuration:")
42-
print("\tPopulation: {}".format(pops))
43-
print("\tFrequency File Directory: {}".format(freq_data_dir))
44-
print("\tOutput File: {}".format(freq_file))
45-
print(
46-
"****************************************************************************************************"
47-
)
48-
49-
haplist_overall = {} # list of haplotypes across all populations
50-
pop_hap_combos = {}
51-
52-
list_pop_count = []
53-
#### Load initial frequency files
54-
for pop in pops:
55-
in_freq_file = freq_data_dir + "/" + pop + ".freqs.gz"
56-
print("Reading Frequency File:\t {}".format(in_freq_file))
57-
with gzip.open(in_freq_file, "rb") as zf:
58-
count_pop = 0
59-
lines = [x.decode("utf8").strip() for x in zf.readlines()]
60-
for hap_line in lines:
61-
haplotype, count, freq = hap_line.split(",")
62-
if haplotype == "Haplo":
63-
continue
64-
freq = float(freq)
65-
# Ignore lines with 0 freq
66-
if freq == 0.0:
67-
continue
68-
69-
pop_haplotype = pop + "-" + haplotype
70-
haplist_overall[haplotype] = 1
71-
pop_hap_combos[pop_haplotype] = freq
72-
73-
count_pop += float(count)
74-
list_pop_count.append(count_pop)
75-
76-
sum_pops = sum(list_pop_count)
77-
pop_ratio_file = open(pop_ratio_dir, "w")
78-
for pop, ratio in zip(pops, list_pop_count):
79-
pop_ratio_file.write("{},{},{}\n".format(pop, ratio, (ratio / sum_pops)))
80-
81-
82-
header = ["hap", "pop", "freq"]
83-
84-
85-
print("Writing hpf File:\t {}".format(freq_file))
86-
with open(freq_file, mode="w") as csv_file:
87-
csv_writer = csv.writer(csv_file, delimiter=",", quoting=csv.QUOTE_NONE)
88-
csv_writer.writerow(header)
89-
for pop_haplotype in pop_hap_combos:
90-
(pop, haplotype) = pop_haplotype.split("-")
91-
freq = pop_hap_combos[pop_haplotype]
92-
csv_writer.writerow([haplotype, pop, freq])
82+
if __name__ == "__main__":
    # Command-line entry point: read the configuration path from the
    # -c/--config option (defaulting to the bundled minimal configuration)
    # and run the HPF conversion.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "-c",
        "--config",
        type=str,
        required=False,
        default="../../conf/minimal-configuration.json",
        help="Configuration JSON file",
    )
    cli_args = arg_parser.parse_args()
    produce_hpf(cli_args.config)

grim/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,4 +26,4 @@
2626
"""Top-level package for py-grim."""
2727

2828
__organization__ = "NMDP/CIBMTR Bioinformatics"
29-
__version__ = "0.0.12"
29+
__version__ = "0.0.13"

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[bumpversion]
2-
current_version = 0.0.12
2+
current_version = 0.0.13
33
commit = True
44
tag = True
55

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@
5252

5353
setup(
5454
name="py-graph-imputation",
55-
version="0.0.12",
55+
version="0.0.13",
5656
author="Pradeep Bashyal",
5757
author_email="[email protected]",
5858
python_requires=">=3.8",

0 commit comments

Comments
 (0)