code for flattening glstrings

mmaiers-nmdp · mmaiers-nmdp · commit 7ec954fc221a · 2019-01-21T12:13:27.000-06:00
diff --git a/pyard/flatten_glstring.py b/pyard/flatten_glstring.py
@@ -0,0 +1,38 @@
+from glstring import flatten
+import argparse
+
+
+def main():
+    """Flatten the GL string on every line of the input file.
+
+    Each input line is "<id>%<glstring>".  The id is copied to the output
+    file unchanged, followed by "%" and the flattened GL string.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-i", "--infile",
+                        required=True,
+                        help="input file",
+                        type=str)
+
+    parser.add_argument("-o", "--outfile",
+                        required=True,
+                        help="output file",
+                        type=str)
+    args = parser.parse_args()
+
+    print("reading from ", args.infile)
+    print("writing to ", args.outfile)
+
+    # one with-statement closes both files, even when a line fails to parse
+    with open(args.infile, 'r') as fin, open(args.outfile, 'w') as fout:
+        for line in fin:
+            # exactly one "%" per line is expected; otherwise ValueError
+            sample_id, gl = line.rstrip().split('%')
+            fgl = flatten(gl)
+            fout.write('%'.join([sample_id, fgl]) + '\n')
+
+
+if __name__ == '__main__':
+    # Entry point: runs only when this file is executed directly,
+    # never when it is imported as a module.
+    main()
diff --git a/pyard/flatten_glstring.sh b/pyard/flatten_glstring.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+# Run flatten_glstring.py (looked up in the current directory) once for
+# every combination of pop code and donor/patient category.
+in_dir=/vol/bio/wmda_simulator/graph/PlanA
+out_dir=/vol/bio/wmda_simulator/graph/PlanA/flatgl
+for pop in AAFA_CARB AAFA_NAMER FILII_NAMER MENAFC_NAMER; do
+  for popcat in donor patient; do
+    stem="${pop}_GraphVal_PlanA_${popcat}"
+    python flatten_glstring.py -i "${in_dir}/${stem}.in" \
+      -o "${out_dir}/${stem}.flat.gl"
+  done
+done
+
+
diff --git a/pyard/glstring.py b/pyard/glstring.py
@@ -0,0 +1,42 @@
+# glstring
+# module for working with glstrings
+
+import re
+
+
+# convert genotype ambiguity into allele ambiguity
+
+# NOTE: the "^" locus delimiter is handled by flatten() below
+
+def flatten(gls):
+    """Flatten every "^"-separated locus of gls with flatten_loc.
+
+    Returns the flattened loci rejoined with "^"; a string without "^" is
+    treated as a single locus.
+    """
+    return "^".join(flatten_loc(locus) for locus in gls.split("^"))
+        
+
+def flatten_loc(gls):
+    """Convert genotype ambiguity at a single locus into allele ambiguity.
+
+    Each "|"-separated genotype is split on "+" into a first and a second
+    typing.  The distinct first typings and the distinct second typings are
+    each sorted and joined with "/", and the two lists are joined with "+".
+    A string without "|" is returned unchanged.
+
+    Raises ValueError if a genotype does not contain exactly one "+".
+    """
+    if "|" not in gls:
+        return gls
+
+    firsts, seconds = set(), set()
+    for geno in gls.split("|"):
+        typings = geno.split("+")
+        if len(typings) != 2:
+            # name the offending genotype instead of failing on unpacking
+            raise ValueError(
+                "genotype %r in %r must contain exactly one '+'" % (geno, gls))
+        firsts.add(typings[0])
+        seconds.add(typings[1])
+    return "+".join("/".join(sorted(side)) for side in (firsts, seconds))
diff --git a/pyard/util.py b/pyard/util.py
@@ -1,8 +1,8 @@
 # -*- coding: utf-8 -*-
 
 #
-#    seqann Sequence Annotation
-#    Copyright (c) 2017 Be The Match operated by National Marrow Donor Program. All Rights Reserved.
+#    pyard pyARD
+#    Copyright (c) 2018 Be The Match operated by National Marrow Donor Program. All Rights Reserved.
 #
 #    This library is free software; you can redistribute it and/or modify it
 #    under the terms of the GNU Lesser General Public License as published

Original file line number	Diff line number	Diff line change
`@@ -1,8 +1,8 @@`
`1`	`1`	`# -*- coding: utf-8 -*-`
`2`	`2`
`3`	`3`	`#`
`4`		`-# seqann Sequence Annotation`
`5`		`-# Copyright (c) 2017 Be The Match operated by National Marrow Donor Program. All Rights Reserved.`
	`4`	`+# pyard pyARD`
	`5`	`+# Copyright (c) 2018 Be The Match operated by National Marrow Donor Program. All Rights Reserved.`
`6`	`6`	`#`
`7`	`7`	`# This library is free software; you can redistribute it and/or modify it`
`8`	`8`	`# under the terms of the GNU Lesser General Public License as published`