|
4 | 4 | import pathlib |
5 | 5 | import argparse |
6 | 6 |
|
| 7 | + |
| 8 | +def produce_hpf(conf_file): |
| 9 | + # Read configuration file and load properties |
| 10 | + with open(conf_file) as f: |
| 11 | + conf = json.load(f) |
| 12 | + |
| 13 | + pops = conf.get("populations") |
| 14 | + freq_data_dir = project_dir + conf.get("freq_data_dir") |
| 15 | + output_dir = project_dir + conf.get("graph_files_path") |
| 16 | + pop_ratio_dir = project_dir + conf.get("pops_count_file") |
| 17 | + # Output in HaplotypePopulationFrequency (hpf) csv file |
| 18 | + freq_file = project_dir + conf.get("freq_file") |
| 19 | + |
| 20 | + # Create output directory if it doesn't exist |
| 21 | + pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True) |
| 22 | + |
| 23 | + # Display the configurations we are using |
| 24 | + print( |
| 25 | + "****************************************************************************************************" |
| 26 | + ) |
| 27 | + print("Conversion to HPF file based on following configuration:") |
| 28 | + print("\tPopulation: {}".format(pops)) |
| 29 | + print("\tFrequency File Directory: {}".format(freq_data_dir)) |
| 30 | + print("\tOutput File: {}".format(freq_file)) |
| 31 | + print( |
| 32 | + "****************************************************************************************************" |
| 33 | + ) |
| 34 | + |
| 35 | + haplist_overall = {} # list of haplotypes across all populations |
| 36 | + pop_hap_combos = {} |
| 37 | + |
| 38 | + list_pop_count = [] |
| 39 | + #### Load initial frequency files |
| 40 | + for pop in pops: |
| 41 | + in_freq_file = freq_data_dir + "/" + pop + ".freqs.gz" |
| 42 | + print("Reading Frequency File:\t {}".format(in_freq_file)) |
| 43 | + with gzip.open(in_freq_file, "rb") as zf: |
| 44 | + count_pop = 0 |
| 45 | + lines = [x.decode("utf8").strip() for x in zf.readlines()] |
| 46 | + for hap_line in lines: |
| 47 | + haplotype, count, freq = hap_line.split(",") |
| 48 | + if haplotype == "Haplo": |
| 49 | + continue |
| 50 | + freq = float(freq) |
| 51 | + # Ignore lines with 0 freq |
| 52 | + if freq == 0.0: |
| 53 | + continue |
| 54 | + |
| 55 | + pop_haplotype = pop + "-" + haplotype |
| 56 | + haplist_overall[haplotype] = 1 |
| 57 | + pop_hap_combos[pop_haplotype] = freq |
| 58 | + |
| 59 | + count_pop += float(count) |
| 60 | + list_pop_count.append(count_pop) |
| 61 | + |
| 62 | + sum_pops = sum(list_pop_count) |
| 63 | + pop_ratio_file = open(pop_ratio_dir, "w") |
| 64 | + for pop, ratio in zip(pops, list_pop_count): |
| 65 | + pop_ratio_file.write("{},{},{}\n".format(pop, ratio, (ratio / sum_pops))) |
| 66 | + |
| 67 | + header = ["hap", "pop", "freq"] |
| 68 | + |
| 69 | + print("Writing hpf File:\t {}".format(freq_file)) |
| 70 | + with open(freq_file, mode="w") as csv_file: |
| 71 | + csv_writer = csv.writer(csv_file, delimiter=",", quoting=csv.QUOTE_NONE) |
| 72 | + csv_writer.writerow(header) |
| 73 | + for pop_haplotype in pop_hap_combos: |
| 74 | + (pop, haplotype) = pop_haplotype.split("-") |
| 75 | + freq = pop_hap_combos[pop_haplotype] |
| 76 | + csv_writer.writerow([haplotype, pop, freq]) |
| 77 | + |
| 78 | + |
| 79 | +# Global |
7 | 80 | project_dir = "" |
8 | 81 |
|
9 | | -parser = argparse.ArgumentParser() |
10 | | -parser.add_argument( |
11 | | - "-c", |
12 | | - "--config", |
13 | | - required=False, |
14 | | - default="../../conf/minimal-configuration.json", |
15 | | - help="Configuration JSON file", |
16 | | - type=str, |
17 | | -) |
18 | | - |
19 | | -args = parser.parse_args() |
20 | | -configuration_file = args.config |
21 | | - |
22 | | -# Read configuration file and load properties |
23 | | -with open(configuration_file) as f: |
24 | | - conf = json.load(f) |
25 | | - |
26 | | -pops = conf.get("populations") |
27 | | -freq_data_dir = project_dir + conf.get("freq_data_dir") |
28 | | -output_dir = project_dir + conf.get("graph_files_path") |
29 | | -pop_ratio_dir = project_dir + conf.get("pops_count_file") |
30 | | -# Output in HaplotypePopulationFrequency (hpf) csv file |
31 | | -freq_file = project_dir + conf.get("freq_file") |
32 | | - |
33 | | - |
34 | | -# Create output directory if it doesn't exist |
35 | | -pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True) |
36 | | - |
37 | | -# Display the configurations we are using |
38 | | -print( |
39 | | - "****************************************************************************************************" |
40 | | -) |
41 | | -print("Conversion to HPF file based on following configuration:") |
42 | | -print("\tPopulation: {}".format(pops)) |
43 | | -print("\tFrequency File Directory: {}".format(freq_data_dir)) |
44 | | -print("\tOutput File: {}".format(freq_file)) |
45 | | -print( |
46 | | - "****************************************************************************************************" |
47 | | -) |
48 | | - |
49 | | -haplist_overall = {} # list of haplotypes across all populations |
50 | | -pop_hap_combos = {} |
51 | | - |
52 | | -list_pop_count = [] |
53 | | -#### Load initial frequency files |
54 | | -for pop in pops: |
55 | | - in_freq_file = freq_data_dir + "/" + pop + ".freqs.gz" |
56 | | - print("Reading Frequency File:\t {}".format(in_freq_file)) |
57 | | - with gzip.open(in_freq_file, "rb") as zf: |
58 | | - count_pop = 0 |
59 | | - lines = [x.decode("utf8").strip() for x in zf.readlines()] |
60 | | - for hap_line in lines: |
61 | | - haplotype, count, freq = hap_line.split(",") |
62 | | - if haplotype == "Haplo": |
63 | | - continue |
64 | | - freq = float(freq) |
65 | | - # Ignore lines with 0 freq |
66 | | - if freq == 0.0: |
67 | | - continue |
68 | | - |
69 | | - pop_haplotype = pop + "-" + haplotype |
70 | | - haplist_overall[haplotype] = 1 |
71 | | - pop_hap_combos[pop_haplotype] = freq |
72 | | - |
73 | | - count_pop += float(count) |
74 | | - list_pop_count.append(count_pop) |
75 | | - |
76 | | -sum_pops = sum(list_pop_count) |
77 | | -pop_ratio_file = open(pop_ratio_dir, "w") |
78 | | -for pop, ratio in zip(pops, list_pop_count): |
79 | | - pop_ratio_file.write("{},{},{}\n".format(pop, ratio, (ratio / sum_pops))) |
80 | | - |
81 | | - |
82 | | -header = ["hap", "pop", "freq"] |
83 | | - |
84 | | - |
85 | | -print("Writing hpf File:\t {}".format(freq_file)) |
86 | | -with open(freq_file, mode="w") as csv_file: |
87 | | - csv_writer = csv.writer(csv_file, delimiter=",", quoting=csv.QUOTE_NONE) |
88 | | - csv_writer.writerow(header) |
89 | | - for pop_haplotype in pop_hap_combos: |
90 | | - (pop, haplotype) = pop_haplotype.split("-") |
91 | | - freq = pop_hap_combos[pop_haplotype] |
92 | | - csv_writer.writerow([haplotype, pop, freq]) |
| 82 | +if __name__ == "__main__": |
| 83 | + |
| 84 | + parser = argparse.ArgumentParser() |
| 85 | + parser.add_argument( |
| 86 | + "-c", |
| 87 | + "--config", |
| 88 | + required=False, |
| 89 | + default="../../conf/minimal-configuration.json", |
| 90 | + help="Configuration JSON file", |
| 91 | + type=str, |
| 92 | + ) |
| 93 | + |
| 94 | + args = parser.parse_args() |
| 95 | + configuration_file = args.config |
| 96 | + |
| 97 | + produce_hpf(configuration_file) |
0 commit comments