Skip to content

Commit 0d31a20

Browse files
authored
Merge pull request #8 from erinyoung/erin-dev
Erin dev
2 parents 86ee772 + fd89004 commit 0d31a20

File tree

12 files changed

+290
-170
lines changed

12 files changed

+290
-170
lines changed

bin/divide.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
#!/usr/bin/python3
"""Divide the query ranges of filtered BLAST hits into sub-intervals.

Usage: divide.py <blast_hits> [minlength]

The hits file is whitespace-separated BLAST tabular output with columns
  0: qseqid, 1: sseqid, 2: pident, 3: length, 4: mismatch, 5: gapopen,
  6: qstart, 7: qend, 8: sstart, 9: send, 10: evalue, 11: bitscore

Hits are kept when their alignment length is at least ``minlength`` and
their percent identity rounds to 90 or more.  Each kept hit's query range is
cut at every start/end coordinate of the other kept hits that falls inside
it; pieces at least ``minlength`` long are written, tab-separated, to
``<qseqid>.divided.bed`` in the current directory.
"""

import sys

DEFAULT_MINLENGTH = 500
MIN_PIDENT = 90.0


def parse_hits(lines, minlength=DEFAULT_MINLENGTH):
    """Collect the query intervals of the hits that pass the filters.

    Args:
        lines: iterable of BLAST tabular lines (an open file works).
        minlength: minimum alignment length (column 3) for a hit to be kept.

    Returns:
        ``(seqid, intervals)`` where ``seqid`` is the qseqid of the last kept
        hit ('' when nothing was kept) and ``intervals`` is a list of
        ``(qstart, qend)`` int tuples in file order.
    """
    seqid = ''
    intervals = []
    for line in lines:
        fields = line.split()
        # Blank lines and '#' comment lines carry no hit.
        if not fields or fields[0].startswith('#'):
            continue
        if int(fields[3]) < minlength:
            continue
        # Same test as the original script: the identity is compared after
        # rounding the difference, so e.g. 89.5 is still accepted.
        if round(float(fields[2]) - MIN_PIDENT) < 0:
            continue
        seqid = fields[0]
        intervals.append((int(fields[6]), int(fields[7])))
    return seqid, intervals


def divide_hits(intervals, minlength=DEFAULT_MINLENGTH):
    """Cut every interval at the boundaries of the other intervals.

    Args:
        intervals: list of ``(start, end)`` int tuples.
        minlength: minimum ``end - start`` for a piece to be reported.

    Returns:
        List of unique ``(start, end)`` pieces in first-seen order.
    """
    starts = sorted({start for start, _ in intervals})
    ends = sorted({end for _, end in intervals})

    divisions = []
    seen = set()

    def keep(lo, hi):
        # Record a piece once, and only if it is long enough.
        if hi - lo >= minlength and (lo, hi) not in seen:
            seen.add((lo, hi))
            divisions.append((lo, hi))

    for start, end in intervals:
        # Cut points: every known start inside [start, end) and the position
        # just after every known end strictly inside (start, end).
        mids = [mid for mid in starts if start <= mid < end]
        mids.extend(mid + 1 for mid in ends if start < mid < end)
        mids.sort()
        if not mids:
            # Only possible for a degenerate interval with start >= end.
            continue
        for lo, nxt in zip(mids, mids[1:]):
            keep(lo, nxt - 1)
        # Tail piece; when there is a single cut point this is the whole
        # interval, because the smallest cut point is always `start`.
        keep(mids[-1], end)
    return divisions


def main(argv=None):
    """Run the command-line tool; ``argv`` defaults to ``sys.argv``."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit("usage: divide.py <blast_hits> [minlength]")

    hits = argv[1]
    try:
        minlength = int(argv[2])
    except (IndexError, ValueError):
        # Missing or non-numeric minlength falls back to the default.
        minlength = DEFAULT_MINLENGTH

    with open(hits, 'r') as handle:
        seqid, intervals = parse_hits(handle, minlength)

    divisions = divide_hits(intervals, minlength)

    # NOTE(review): coordinates are written exactly as BLAST reports them;
    # confirm downstream tools do not expect 0-based BED starts.
    with open(seqid + ".divided.bed", "w") as out:
        out.writelines(
            "\t".join((seqid, str(lo), str(hi))) + "\n" for lo, hi in divisions
        )


if __name__ == "__main__":
    main()

bin/divide.sh

Lines changed: 0 additions & 38 deletions
This file was deleted.

bin/groups.py

Lines changed: 68 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#!/usr/bin/python3
22

33
import sys
4+
#from xmlrpc.server import MultiPathXMLRPCServer
45

56
try:
67
minlength = int(sys.argv[1])
@@ -14,60 +15,80 @@
1415
if (int(end) - int(start) >= minlength):
1516
if chr not in divided.keys():
1617
divided[chr]={}
17-
divided[chr][start] = end
18+
divided[chr][start] = int(end)
19+
20+
if not divided:
21+
print("No blast matches found!")
22+
exit(0)
23+
24+
# blast file columns
25+
# 0: qseqid, 1: sseqid, 2: pident, 3: length, 4: mismatch, 5: gapopen, 6: qstart, 7: qend, 8: sstart, 9: send, 10: evalue, 11: bitscore
1826

1927
i=0
2028
group = {str(i) : {'list': [], 'length' : 0}}
2129
lengths = []
2230
matches = []
2331
with open("combine/blast_hits.txt", 'r') as file :
2432
for line in file.readlines() :
25-
chr1 = line.split()[0]
26-
chr2 = line.split()[1]
27-
start1 = line.split()[6]
28-
end1 = line.split()[7]
29-
start2 = line.split()[8]
30-
end2 = line.split()[9]
31-
for key in divided[chr1].keys():
32-
if int(start1) <= int(key) and int(end1) >= int(divided[chr1][key]):
33-
start_diff = int(key) - int(start1)
34-
end_diff = int(end1) - int(divided[chr1][key])
35-
saved_line= chr1 + ":" + key + ':' + divided[chr1][key]
36-
match_length = int(divided[chr1][key]) - int(key)
37-
if int(start2) < int(end2):
38-
match_start = int(start2) + start_diff
39-
match_end = int(end2) - end_diff
40-
elif (int(start2) > int(end2)):
41-
match_start = int(end2) + int(end_diff)
42-
match_end = int(start2) - int(start_diff)
43-
44-
if str(match_start) not in divided[chr2].keys():
45-
for potential_start in divided[chr2].keys():
46-
if (int(potential_start) - 20) <= match_start <= (int(potential_start) + 20):
47-
match_start=int(potential_start)
48-
49-
# sometimes there are small indels
50-
if str(match_start) in divided[chr2].keys():
51-
if int(divided[chr2][str(match_start)]) != int(match_end):
52-
match_end=int(divided[chr2][str(match_start)])
53-
54-
saved_line2= chr2 + ":" + str(match_start) + ":" + str(match_end)
55-
new_group = True
56-
57-
for numbers in group.keys():
58-
if saved_line in group[numbers]['list']:
59-
new_group = False
60-
if (saved_line2 not in group[numbers]['list']) and (start2 in divided[chr2].keys()):
61-
group[numbers]['list'].append(saved_line2)
62-
elif saved_line2 in group[numbers]['list']:
63-
new_group = False
64-
if saved_line not in group[numbers]['list']:
65-
group[numbers]['list'].append(saved_line)
66-
67-
if new_group:
68-
i += 1
69-
group[str(i)] = { 'list': [saved_line, saved_line2], "length" : match_length }
70-
lengths.append(match_length)
33+
qseqid = line.split()[0]
34+
sseqid = line.split()[1]
35+
qstart = int(line.split()[6])
36+
qend = int(line.split()[7])
37+
sstart = int(line.split()[8])
38+
send = int(line.split()[9])
39+
40+
for start in divided[qseqid].keys():
41+
saved_line = ''
42+
start_diff = 0
43+
end_diff = 0
44+
45+
# looking to see if potential groups in divided apply to this line
46+
if int(start) >= qstart and divided[qseqid][start] <= qend:
47+
start_diff = int(start) - qstart
48+
end_diff = qend - divided[qseqid][start]
49+
saved_line = qseqid + ":" + start + ':' + str(divided[qseqid][start])
50+
match_length = divided[qseqid][start] - int(start)
51+
52+
if sstart < send:
53+
match_start = sstart + start_diff
54+
match_end = send - end_diff
55+
elif sstart > send:
56+
match_start = send + end_diff
57+
match_end = sstart - start_diff
58+
59+
match_found = False
60+
if str(match_start) not in divided[sseqid].keys():
61+
for start_fix in divided[sseqid].keys():
62+
if int(start_fix) - 50 <= int(match_start) <= int(start_fix) + 50 :
63+
match_start = start_fix
64+
if divided[sseqid][start_fix] - 50 <= match_end <= divided[sseqid][start_fix] + 50 :
65+
match_end = divided[sseqid][start_fix]
66+
match_found = True
67+
else :
68+
print("no match was found for 2 : " + line)
69+
else:
70+
if divided[sseqid][str(match_start)] - 50 <= match_end <= divided[sseqid][str(match_start)] + 50 :
71+
match_end = divided[sseqid][str(match_start)]
72+
match_found = True
73+
74+
saved_line2 = sseqid + ":" + str(match_start) + ":" + str(match_end)
75+
76+
if match_found :
77+
new_group = True
78+
for saved in group.keys():
79+
if saved_line in group[saved]['list']:
80+
new_group = False
81+
if (saved_line2 not in group[saved]['list']):
82+
group[saved]['list'].append(saved_line2)
83+
elif saved_line2 in group[saved]['list']:
84+
new_group = False
85+
if saved_line not in group[saved]['list']:
86+
group[saved]['list'].append(saved_line)
87+
88+
if new_group:
89+
i += 1
90+
group[str(i)] = { 'list': [saved_line, saved_line2], "length" : match_length }
91+
lengths.append(match_length)
7192
del group["0"]
7293

7394
# adjusting duplicate groups

0 commit comments

Comments
 (0)