62 changes: 62 additions & 0 deletions RunGrim.py
@@ -0,0 +1,62 @@
import json
import os
import pandas as pd

from graph_generation.generate_hpf import produce_hpf
from grim.grim import graph_freqs
from grim.grim import impute
from filter_top_3 import change_donor_file
from filter_by_rest import change_output_by_extra_gl

def remove_empty_rows(file_path):
df = pd.read_csv(file_path)

df_cleaned = df.dropna(how='all')

df_cleaned.to_csv(file_path, index=False)

def run_original_grim(path_configuration, hap_pop_pair=True, Producehpf=False, dominant3=True):
with open(path_configuration,'r') as f:
config = json.load(f)

    # first step in py-graph-imputation: generate the hpf frequency file
    if Producehpf:
        produce_hpf(conf_file=path_configuration)

    path_hpf = config["freq_file"]
    # remove empty rows from the hpf file, otherwise the next step fails
    remove_empty_rows(path_hpf)

    # second step in py-graph-imputation: build the frequency graph
    graph_freqs(conf_file=path_configuration)


    # reduce the donor file to the 3 most important GLs; returns short_gl and extra_gl for each donor row
    if dominant3:
        path_donor = config["imputation_in_file"]

        gls, lines = change_donor_file(path_donor)  # TODO: avoid modifying the donor file in place

    # imputation
    impute(conf_file=path_configuration, hap_pop_pair=hap_pop_pair)

    # change the output: filter results by the extra_gl
if dominant3:
path_pmug = os.path.join(config["imputation_out_path"], config["imputation_out_hap_freq_filename"])
path_umug = os.path.join(config["imputation_out_path"], config["imputation_out_umug_freq_filename"])
path_umug_pops = os.path.join(config["imputation_out_path"], config["imputation_out_umug_pops_filename"])
path_pmug_pops = os.path.join(config["imputation_out_path"], config["imputation_out_hap_pops_filename"])
path_miss = os.path.join(config["imputation_out_path"], config["imputation_out_miss_filename"])

        change_output_by_extra_gl(config, gls, path_pmug, path_umug, path_umug_pops, path_pmug_pops, path_miss)  # filter results in the original output files; append misses to the existing miss file

        # restore the original donor file
        with open(path_donor, "w") as file:
            for line in lines:
                file.write(line)

if __name__ == "__main__":
    conf_file = "conf/minimal-configuration.json"
    run_original_grim(conf_file, True, True, True)
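For reference, these are the configuration keys this script reads; the key names come from the code above, while the values shown are hypothetical placeholders:

```json
{
  "freq_file": "output/hpf.csv",
  "imputation_in_file": "data/donors.txt",
  "imputation_out_path": "output",
  "imputation_out_hap_freq_filename": "don.pmug",
  "imputation_out_umug_freq_filename": "don.umug",
  "imputation_out_umug_pops_filename": "don.umug.pops",
  "imputation_out_hap_pops_filename": "don.pmug.pops",
  "imputation_out_miss_filename": "don.miss"
}
```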
6 changes: 3 additions & 3 deletions conf/minimal-configuration.json
@@ -2,7 +2,7 @@
   "populations": [
     "CAU"
   ],
-  "freq_trim_threshold": 1e-5,
+  "freq_trim_threshold": 0.1,
   "priority": {
     "alpha": 0.4999999,
     "eta": 0,
@@ -29,10 +29,10 @@
     [[1], [2, 3], [4], [5]],
     [[1], [2], [3], [4], [5]]
   ],
-  "planb": true,
+  "planb": false,
   "number_of_options_threshold": 100000,
   "epsilon": 1e-3,
-  "number_of_results": 10,
+  "number_of_results": 1000,
   "number_of_pop_results": 100,
   "output_MUUG": true,
   "output_haplotypes": true,
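A quick way to sanity-check the three knobs this PR changes (a minimal sketch; the comments on what each knob does are a reading of the names, not confirmed by the diff):

```python
import json

with open("conf/minimal-configuration.json") as f:
    conf = json.load(f)

print(conf["freq_trim_threshold"])  # 0.1  (was 1e-5): presumably trims many more low-frequency haplotypes
print(conf["planb"])                # False (was true): presumably disables the fallback imputation strategy
print(conf["number_of_results"])    # 1000 (was 10): keeps far more candidate pairs for the extra_gl filtering
```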
303 changes: 303 additions & 0 deletions filter_by_rest.py
@@ -0,0 +1,303 @@


def filter_results(res_haps, extra_gl):
"""
    Filter the haplotype results down to those consistent with the extra_gl.
    res_muugs (the corresponding MUUG result, shown here only for context):
{
'MaxProb': 1.2370678464000013e-16,
'Haps': {'A*01:01+A*33:03^B*08:01+B*44:03^C*07:01+C*07:01^DQB1*02:01+DQB1*02:01^DRB1*03:01+DRB1*07:01': 1.5523456571675956e-16},
'Pops': {'CAU,CAU': 1.5523456571675956e-16}
}
res_haps: dictionary:
{
'MaxProb': 1.2370678464000013e-16,
'Haps': [['A*01:01~B*08:01~C*07:01~DQB1*02:01~DRB1*03:01', 'A*33:03~B*44:03~C*07:01~DQB1*02:01~DRB1*07:01'],
['A*33:03~B*08:01~C*07:01~DQB1*02:01~DRB1*03:01', 'A*01:01~B*44:03~C*07:01~DQB1*02:01~DRB1*07:01'],
['A*01:01~B*44:03~C*07:01~DQB1*02:01~DRB1*03:01', 'A*33:03~B*08:01~C*07:01~DQB1*02:01~DRB1*07:01'],
['A*33:03~B*44:03~C*07:01~DQB1*02:01~DRB1*03:01', 'A*01:01~B*08:01~C*07:01~DQB1*02:01~DRB1*07:01']],
'Probs': [1.2370678464000013e-16, 2.990312032960011e-17, 3.0052635931248134e-22, 1.6243602208000046e-18],
'Pops': [['CAU', 'CAU'], ['CAU', 'CAU'], ['CAU', 'CAU'], ['CAU', 'CAU']]
}
Extra GL: C*07:01+C*07:01^DQB1*02:01+DQB1*02:01
Short GL: A*01:01+A*33:03^B*08:01+B*44:03^DRB1*03:01+DRB1*07:01
"""

split_extra_gl_into_locus = extra_gl.split("^")

dct = {locus.split("*")[0]: [set(locus.split("+")[0].split("/")), set(locus.split("+")[1].split("/"))]
for locus in split_extra_gl_into_locus}

haps = res_haps["Haps"]
filter_idx = []
for idx, pair in enumerate(haps):
check = True
hap1, hap2 = pair[0], pair[1]
for allele1, allele2 in zip(hap1.split("~"), hap2.split("~")):
loc = allele1.split("*")[0]
if loc in dct:
if not ((allele1 in dct[loc][0] and allele2 in dct[loc][1]) or (
allele1 in dct[loc][1] and allele2 in dct[loc][0])):
check = False
break
if check:
filter_idx.append(idx)
res_haps["Haps"] = [res_haps["Haps"][idx] for idx in filter_idx]
res_haps["Probs"] = [res_haps["Probs"][idx] for idx in filter_idx]
res_haps["Pops"] = [res_haps["Pops"][idx] for idx in filter_idx]
if not res_haps["Probs"]:
return {"Haps": [], "Probs": [], "Pops": []}

return res_haps
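A quick check of `filter_results` on the example from the docstring (data copied from above):

```python
res_haps = {
    "Haps": [["A*01:01~B*08:01~C*07:01~DQB1*02:01~DRB1*03:01",
              "A*33:03~B*44:03~C*07:01~DQB1*02:01~DRB1*07:01"]],
    "Probs": [1.2370678464000013e-16],
    "Pops": [["CAU", "CAU"]],
}
extra_gl = "C*07:01+C*07:01^DQB1*02:01+DQB1*02:01"

filtered = filter_results(res_haps, extra_gl)
# the pair carries C*07:01/C*07:01 and DQB1*02:01/DQB1*02:01, so it is kept
assert len(filtered["Haps"]) == 1
```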


def create_subject_dict(file_path):
subject_dict = {}

# Open and read the file
with open(file_path, 'r') as file:
for line in file:
line = line.strip()
if not line:
continue

subject_id = line.split(',', 1)[0]

if subject_id not in subject_dict:
subject_dict[subject_id] = []

subject_dict[subject_id].append(line)

return subject_dict

def create_haps(path_pmug):
subject_dict = create_subject_dict(path_pmug)
all_haps = {"subject_id": [], "res_haps": []}

    for subject_id, rows in subject_dict.items():
        res_haps = {"Haps": [], "Probs": [], "Pops": []}
        for row in rows:
            fields = row.split(',')
            pair1 = fields[1].split(';')
            haps1, pops1 = pair1[0], pair1[1]
            pair2 = fields[2].split(';')
            haps2, pops2 = pair2[0], pair2[1]
            prob = float(fields[3])

res_haps["Haps"].append([haps1, haps2])
res_haps["Pops"].append([pops1, pops2])
res_haps["Probs"].append(prob)

all_haps["subject_id"].append(id)
all_haps["res_haps"].append(res_haps)

return all_haps
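From the parsing above, each pmug row is expected to look like `subject_id,hap1;pop1,hap2;pop2,prob,index` (the trailing index written by the output functions is ignored here). A hypothetical example row:

```text
D001,A*01:01~B*08:01~C*07:01~DQB1*02:01~DRB1*03:01;CAU,A*33:03~B*44:03~C*07:01~DQB1*02:01~DRB1*07:01;CAU,1.2370678464000013e-16,0
```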

def is_subarray_unordered(large_array, small_array):
# Convert arrays to sets
set_large = set(large_array)
set_small = set(small_array)

# Check if all elements of small_array are in large_array
return set_small.issubset(set_large)

def write_best_hap_race_pairs(name_gl, haps, pops, probs, fout, numOfResults):
all_res = []

for i in range(len(probs)):
pair = haps[i][0] + ";" + pops[i][0] + "," + haps[i][1] + ";" + pops[i][1]
all_res.append([probs[i], pair])
all_res.sort(key=lambda x: x[0], reverse=True)
# write the output to file
    minBestResult = min(numOfResults, len(all_res))
for k in range(minBestResult):
fout.write(
name_gl
+ ","
+ str(all_res[k][1])
+ ","
+ str(all_res[k][0])
+ ","
+ str(k)
+ "\n"
)

def write_best_prob(name_gl, res, probs, fout, number_of_pop_results, sign=","):
    sumProbsDict = {}
# loop over the result and sum the prob by populations/haplotype
for k in range(len(res)):
key = res[k][0] + sign + res[k][1]
if key in sumProbsDict:
sumProb = probs[k] + sumProbsDict[key]
sumProbsDict[key] = sumProb
else:
key2 = res[k][1] + sign + res[k][0]
if key2 in sumProbsDict:
sumProb = probs[k] + sumProbsDict[key2]
sumProbsDict[key2] = sumProb
else:
sumProbsDict[key] = probs[k]


multProbs = []
for k in sumProbsDict:
multProbs.append([sumProbsDict[k], [k, sumProbsDict[k]]])

multProbs.sort(key=lambda x: x[0], reverse=True)

# write the output to file
minBestResult =min(len(multProbs),number_of_pop_results)
for k in range(minBestResult):
fout.write(
name_gl
+ ","
+ str(multProbs[k][1][0])
+ ","
+ str(multProbs[k][0])
+ ","
+ str(k)
+ "\n"
)
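
`write_best_prob` sums probabilities over unordered pairs, so ('CAU','MED') and ('MED','CAU') collapse into one key. A minimal sketch with made-up subject id and values:

```python
import io

buf = io.StringIO()
write_best_prob("D001", [["CAU", "MED"], ["MED", "CAU"]], [0.6, 0.3], buf, 1)
print(buf.getvalue())  # D001,CAU,MED,0.9,0 -- both orderings summed into one entry
```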

def write_umug(subject_id, res_haps, fout, numOfResults):

res_muugs = {}
    for idx, hap_pair in enumerate(res_haps["Haps"]):
        hap1, hap2 = hap_pair
        prob = res_haps["Probs"][idx]
        # build the MUUG string: per-locus sorted allele pairs joined by '^'
        alleles1, alleles2 = hap1.split('~'), hap2.split('~')
        muug = "^".join(
            "+".join(sorted([a1, a2])) for a1, a2 in zip(alleles1, alleles2)
        )
        if muug in res_muugs:
            res_muugs[muug] += prob
        else:
            res_muugs[muug] = prob
    pairs = sorted(res_muugs.items(), key=lambda x: x[1], reverse=True)
minResults = min(numOfResults,len(pairs))
for k in range(minResults):
fout.write(
            subject_id
+ ","
+ str(pairs[k][0])
+ ","
+ str(pairs[k][1])
+ ","
+ str(k)
+ "\n"
)
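
Using the haplotype pair from the `filter_results` docstring, the MUUG construction in `write_umug` behaves like this (a sketch of the inner transform only):

```python
hap1 = "A*01:01~B*08:01~C*07:01~DQB1*02:01~DRB1*03:01"
hap2 = "A*33:03~B*44:03~C*07:01~DQB1*02:01~DRB1*07:01"
muug = "^".join("+".join(sorted([a1, a2]))
                for a1, a2 in zip(hap1.split('~'), hap2.split('~')))
print(muug)
# A*01:01+A*33:03^B*08:01+B*44:03^C*07:01+C*07:01^DQB1*02:01+DQB1*02:01^DRB1*03:01+DRB1*07:01
```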

def write_umug_pops(subject_id, res_haps, fout, numOfResults):
    res_muugs = {}
    for idx, pop_pair in enumerate(res_haps["Pops"]):
        pop1, pop2 = pop_pair
        prob = res_haps["Probs"][idx]
        pops = sorted([pop1, pop2])
        muug = pops[0] + ',' + pops[1]
        if muug in res_muugs:
            res_muugs[muug] += prob
        else:
            res_muugs[muug] = prob
    pairs = sorted(res_muugs.items(), key=lambda x: x[1], reverse=True)
minResults = min(numOfResults,len(pairs))
for k in range(minResults):
fout.write(
            subject_id
+ ","
+ str(pairs[k][0])
+ ","
+ str(pairs[k][1])
+ ","
+ str(k)
+ "\n"
)

def write_filter(subject_id, res_haps, fout_hap_haplo, fout_pop_haplo, fout_hap_muug, fout_pop_muug, number_of_results, number_of_pop_results, MUUG_output, haps_output):
haps = res_haps["Haps"]
probs = res_haps["Probs"]
pops = res_haps["Pops"]
if haps_output:
write_best_hap_race_pairs(
subject_id,
haps,
pops,
probs,
fout_hap_haplo,
number_of_results
)
        write_best_prob(subject_id, pops, probs, fout_pop_haplo, 1)
    if MUUG_output:
        write_umug(subject_id, res_haps, fout_hap_muug, number_of_results)
        write_umug_pops(subject_id, res_haps, fout_pop_muug, number_of_pop_results)


def change_output_by_extra_gl(config, gls, path_pmug, path_umug, path_umug_pops, path_pmug_pops, path_miss):
res_haps = create_haps(path_pmug)
all_data = {"subject_id": [], "res_haps": [], "extra_gl": [], "short_gl": []}

    if is_subarray_unordered(gls["subject_id"], res_haps["subject_id"]):
        ids = []
        haps = []
        extras = []
        shorts = []
        for idx, subject_id in enumerate(res_haps["subject_id"]):
            ids.append(subject_id)
            haps.append(res_haps["res_haps"][idx])
            gl_idx = gls["subject_id"].index(subject_id)
            extras.append(gls["extra_gl"][gl_idx])
            shorts.append(gls["short_gl"][gl_idx])
all_data["subject_id"] = ids
all_data["res_haps"] = haps
all_data["extra_gl"] = extras
all_data["short_gl"] = shorts
else:
print("error we got umug has ids that are not form the gls")

MUUG_output = config["output_MUUG"]
haps_output = config["output_haplotypes"]
number_of_results = config["number_of_results"]
number_of_pop_results = config["number_of_pop_results"]

    fout_hap_haplo = fout_pop_haplo = fout_hap_muug = fout_pop_muug = None

    if haps_output:
        fout_hap_haplo = open(path_pmug, "w")
        fout_pop_haplo = open(path_pmug_pops, "w")
    if MUUG_output:
        fout_hap_muug = open(path_umug, "w")
        fout_pop_muug = open(path_umug_pops, "w")
    miss = open(path_miss, "a")

    for idx, subject_id in enumerate(all_data["subject_id"]):
        res_haps = all_data["res_haps"][idx]
        extra_gl = all_data["extra_gl"][idx]

        if len(extra_gl) > 0:
            res_haps = filter_results(res_haps, extra_gl)

        if len(res_haps["Haps"]) == 0:
            gl_idx = gls["subject_id"].index(subject_id)
            miss.write(str(gl_idx) + "," + str(subject_id) + "\n")
        else:
            write_filter(subject_id, res_haps, fout_hap_haplo, fout_pop_haplo, fout_hap_muug, fout_pop_muug, number_of_results, number_of_pop_results, MUUG_output, haps_output)

if MUUG_output:
fout_hap_muug.close()
fout_pop_muug.close()
if haps_output:
fout_hap_haplo.close()
fout_pop_haplo.close()
miss.close()
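
From the lookups in `change_output_by_extra_gl`, the `gls` argument produced by `change_donor_file` is expected to carry parallel lists keyed by subject. A hypothetical example (the subject id is made up; the GL strings are taken from the `filter_results` docstring):

```python
gls = {
    "subject_id": ["D001"],
    "short_gl": ["A*01:01+A*33:03^B*08:01+B*44:03^DRB1*03:01+DRB1*07:01"],
    "extra_gl": ["C*07:01+C*07:01^DQB1*02:01+DQB1*02:01"],
}
```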