erinyoung
diff --git a/‎bin/divide.py‎
Lines changed: 64 additions & 0 deletions b/‎bin/divide.py‎
Lines changed: 64 additions & 0 deletions
diff --git a/‎bin/divide.sh‎
Lines changed: 0 additions & 38 deletions b/‎bin/divide.sh‎
Lines changed: 0 additions & 38 deletions
diff --git a/‎bin/groups.py‎
Lines changed: 68 additions & 47 deletions b/‎bin/groups.py‎
Lines changed: 68 additions & 47 deletions
@@ -0,0 +1,64 @@
+#!/usr/bin/python3
+
+import sys
+
+hits=sys.argv[1]
+try:
+    minlength = int(sys.argv[2])
+except:
+    minlength = 500
+
+# blast file columns
+# 0: qseqid, 1: sseqid, 2: pident, 3: length, 4: mismatch, 5: gapopen, 6: qstart, 7: qend, 8: sstart, 9: send, 10: evalue, 11: bitscore
+
+starts = []
+ends   = []
+chr    = ''
+with open(hits, 'r') as file :
+    for line in file.readlines() :
+        if int(line.split()[3]) >= minlength and round(float(line.split()[2]) - 90.0) >= 0:
+            chr   = line.split()[0]
+            start = int(line.split()[6])
+            end   = int(line.split()[7])
+            if start not in starts:
+                starts.append(start)
+            if end not in ends:
+                ends.append(end)
+
+starts.sort()
+ends.sort()
+
+divisions = []
+with open(hits, 'r') as file :
+    for line in file.readlines() :
+        if int(line.split()[3]) >= minlength and round(float(line.split()[2]) - 90.0) >= 0:
+            start = int(line.split()[6])
+            end   = int(line.split()[7])
+            mids  = []
+            for mid in starts:
+                if start <= mid < end:
+                    mids.append(mid)
+            for mid in ends:
+                if start < mid < end:
+                    mids.append(mid+1)
+            mids.sort()
+            if len(mids) == 1:
+                if end - start >= minlength:
+                    div=str(start) + "-" + str(end)
+                    if div not in divisions:
+                        divisions.append(div)
+            else:
+                for i in range(1, len(mids)):
+                    if mids[i]-1 - mids [i-1] >= minlength:
+                        div=str(mids[i-1]) + "-" + str(mids[i]-1)
+                        if div not in divisions:
+                            divisions.append(div)
+                if end - mids[-1] >= minlength:
+                    div=str(mids[-1]) + "-" + str(end)
+                    if div not in divisions:
+                        divisions.append(div)
+
+file = open(chr + ".divided.bed", "w")
+for div in divisions:
+    start,end = div.split("-")
+    file.write(chr + "\t" + start + "\t" + end + "\n")
@@ -1,6 +1,7 @@
 #!/usr/bin/python3
 
 import sys
+#from xmlrpc.server import MultiPathXMLRPCServer
 
 try:
     minlength = int(sys.argv[1])
@@ -14,60 +15,80 @@
         if (int(end) - int(start) >= minlength):
             if chr not in divided.keys():
                 divided[chr]={}
-            divided[chr][start] = end
+            divided[chr][start] = int(end)
+
+if not divided:
+    print("No blast matches found!")
+    exit(0)
+
+# blast file columns
+# 0: qseqid, 1: sseqid, 2: pident, 3: length, 4: mismatch, 5: gapopen, 6: qstart, 7: qend, 8: sstart, 9: send, 10: evalue, 11: bitscore
 
 i=0
 group = {str(i) : {'list': [], 'length' : 0}}
 lengths = []
 matches = []
 with open("combine/blast_hits.txt", 'r') as file :
     for line in file.readlines() :
-        chr1 = line.split()[0]
-        chr2 = line.split()[1]
-        start1 = line.split()[6]
-        end1 = line.split()[7]
-        start2 = line.split()[8]
-        end2 = line.split()[9]
-        for key in divided[chr1].keys():
-            if int(start1) <= int(key) and int(end1) >= int(divided[chr1][key]):
-                start_diff = int(key) - int(start1)
-                end_diff = int(end1) - int(divided[chr1][key])
-                saved_line= chr1 + ":" + key + ':' + divided[chr1][key]
-                match_length = int(divided[chr1][key]) - int(key)             
-                if int(start2) < int(end2):
-                    match_start = int(start2) + start_diff
-                    match_end = int(end2) - end_diff
-                elif (int(start2) > int(end2)):
-                    match_start = int(end2) + int(end_diff)
-                    match_end = int(start2) - int(start_diff)
-
-                if str(match_start) not in divided[chr2].keys():
-                    for potential_start in divided[chr2].keys():
-                        if (int(potential_start) - 20) <= match_start <= (int(potential_start) + 20):
-                            match_start=int(potential_start)
-
-                # sometimes there are small indels
-                if str(match_start) in divided[chr2].keys():
-                    if int(divided[chr2][str(match_start)]) != int(match_end):
-                        match_end=int(divided[chr2][str(match_start)])
-
-                saved_line2= chr2 + ":" + str(match_start) + ":" + str(match_end)
-                new_group = True
-
-                for numbers in group.keys():
-                    if saved_line in group[numbers]['list']:
-                        new_group = False
-                        if (saved_line2 not in group[numbers]['list']) and (start2 in divided[chr2].keys()):
-                            group[numbers]['list'].append(saved_line2)      
-                    elif saved_line2 in group[numbers]['list']:
-                        new_group = False
-                        if saved_line not in group[numbers]['list']:
-                            group[numbers]['list'].append(saved_line)
-
-                if new_group:
-                    i += 1
-                    group[str(i)] = { 'list': [saved_line, saved_line2], "length" : match_length }
-                    lengths.append(match_length)
+        qseqid      = line.split()[0]
+        sseqid      = line.split()[1]
+        qstart      = int(line.split()[6])
+        qend        = int(line.split()[7])
+        sstart      = int(line.split()[8])
+        send        = int(line.split()[9])
+
+        for start in divided[qseqid].keys():
+            saved_line          = ''
+            start_diff          = 0
+            end_diff            = 0
+
+            # looking to see if potential groups in divided apply to this line
+            if int(start) >= qstart and divided[qseqid][start] <= qend:
+                start_diff      = int(start) - qstart
+                end_diff        = qend - divided[qseqid][start]                
+                saved_line      = qseqid + ":" + start + ':' + str(divided[qseqid][start])
+                match_length    = divided[qseqid][start] - int(start)
+
+                if sstart < send:
+                    match_start = sstart + start_diff
+                    match_end   = send   - end_diff
+                elif sstart > send:
+                    match_start = send   + end_diff
+                    match_end   = sstart - start_diff
+
+                match_found = False
+                if str(match_start) not in divided[sseqid].keys():
+                    for start_fix in divided[sseqid].keys():
+                        if int(start_fix) - 50 <= int(match_start) <= int(start_fix) + 50 :
+                            match_start     = start_fix
+                            if divided[sseqid][start_fix] - 50 <= match_end <= divided[sseqid][start_fix] + 50 :
+                                match_end   = divided[sseqid][start_fix]
+                                match_found = True
+                            else :
+                                print("no match was found for 2 : " + line)
+                else:
+                    if divided[sseqid][str(match_start)] - 50 <= match_end <= divided[sseqid][str(match_start)] + 50 :
+                        match_end   = divided[sseqid][str(match_start)]
+                        match_found = True
+
+                saved_line2 = sseqid + ":" + str(match_start) + ":" + str(match_end)
+
+                if match_found :
+                    new_group = True
+                    for saved in group.keys():
+                        if saved_line in group[saved]['list']:
+                            new_group = False
+                            if (saved_line2 not in group[saved]['list']):
+                                group[saved]['list'].append(saved_line2)      
+                        elif saved_line2 in group[saved]['list']:
+                            new_group = False
+                            if saved_line not in group[saved]['list']:
+                                group[saved]['list'].append(saved_line)
+
+                    if new_group:
+                        i += 1
+                        group[str(i)] = { 'list': [saved_line, saved_line2], "length" : match_length }
+                        lengths.append(match_length)
 del group["0"]
 
 # adjusting duplicate groups