|
1 | 1 | #!/usr/bin/python3 |
2 | 2 |
|
3 | 3 | import sys |
| 4 | +#from xmlrpc.server import MultiPathXMLRPCServer |
4 | 5 |
|
5 | 6 | try: |
6 | 7 | minlength = int(sys.argv[1]) |
|
14 | 15 | if (int(end) - int(start) >= minlength): |
15 | 16 | if chr not in divided.keys(): |
16 | 17 | divided[chr]={} |
17 | | - divided[chr][start] = end |
| 18 | + divided[chr][start] = int(end) |
| 19 | + |
| 20 | +if not divided: |
| 21 | + print("No blast matches found!") |
| 22 | + exit(0) |
| 23 | + |
| 24 | +# blast file columns |
| 25 | +# 0: qseqid, 1: sseqid, 2: pident, 3: length, 4: mismatch, 5: gapopen, 6: qstart, 7: qend, 8: sstart, 9: send, 10: evalue, 11: bitscore |
18 | 26 |
|
19 | 27 | i=0 |
20 | 28 | group = {str(i) : {'list': [], 'length' : 0}} |
21 | 29 | lengths = [] |
22 | 30 | matches = [] |
23 | 31 | with open("combine/blast_hits.txt", 'r') as file : |
24 | 32 | for line in file.readlines() : |
25 | | - chr1 = line.split()[0] |
26 | | - chr2 = line.split()[1] |
27 | | - start1 = line.split()[6] |
28 | | - end1 = line.split()[7] |
29 | | - start2 = line.split()[8] |
30 | | - end2 = line.split()[9] |
31 | | - for key in divided[chr1].keys(): |
32 | | - if int(start1) <= int(key) and int(end1) >= int(divided[chr1][key]): |
33 | | - start_diff = int(key) - int(start1) |
34 | | - end_diff = int(end1) - int(divided[chr1][key]) |
35 | | - saved_line= chr1 + ":" + key + ':' + divided[chr1][key] |
36 | | - match_length = int(divided[chr1][key]) - int(key) |
37 | | - if int(start2) < int(end2): |
38 | | - match_start = int(start2) + start_diff |
39 | | - match_end = int(end2) - end_diff |
40 | | - elif (int(start2) > int(end2)): |
41 | | - match_start = int(end2) + int(end_diff) |
42 | | - match_end = int(start2) - int(start_diff) |
43 | | - |
44 | | - if str(match_start) not in divided[chr2].keys(): |
45 | | - for potential_start in divided[chr2].keys(): |
46 | | - if (int(potential_start) - 20) <= match_start <= (int(potential_start) + 20): |
47 | | - match_start=int(potential_start) |
48 | | - |
49 | | - # sometimes there are small indels |
50 | | - if str(match_start) in divided[chr2].keys(): |
51 | | - if int(divided[chr2][str(match_start)]) != int(match_end): |
52 | | - match_end=int(divided[chr2][str(match_start)]) |
53 | | - |
54 | | - saved_line2= chr2 + ":" + str(match_start) + ":" + str(match_end) |
55 | | - new_group = True |
56 | | - |
57 | | - for numbers in group.keys(): |
58 | | - if saved_line in group[numbers]['list']: |
59 | | - new_group = False |
60 | | - if (saved_line2 not in group[numbers]['list']) and (start2 in divided[chr2].keys()): |
61 | | - group[numbers]['list'].append(saved_line2) |
62 | | - elif saved_line2 in group[numbers]['list']: |
63 | | - new_group = False |
64 | | - if saved_line not in group[numbers]['list']: |
65 | | - group[numbers]['list'].append(saved_line) |
66 | | - |
67 | | - if new_group: |
68 | | - i += 1 |
69 | | - group[str(i)] = { 'list': [saved_line, saved_line2], "length" : match_length } |
70 | | - lengths.append(match_length) |
| 33 | + qseqid = line.split()[0] |
| 34 | + sseqid = line.split()[1] |
| 35 | + qstart = int(line.split()[6]) |
| 36 | + qend = int(line.split()[7]) |
| 37 | + sstart = int(line.split()[8]) |
| 38 | + send = int(line.split()[9]) |
| 39 | + |
| 40 | + for start in divided[qseqid].keys(): |
| 41 | + saved_line = '' |
| 42 | + start_diff = 0 |
| 43 | + end_diff = 0 |
| 44 | + |
| 45 | + # looking to see if potential groups in divided apply to this line |
| 46 | + if int(start) >= qstart and divided[qseqid][start] <= qend: |
| 47 | + start_diff = int(start) - qstart |
| 48 | + end_diff = qend - divided[qseqid][start] |
| 49 | + saved_line = qseqid + ":" + start + ':' + str(divided[qseqid][start]) |
| 50 | + match_length = divided[qseqid][start] - int(start) |
| 51 | + |
| 52 | + if sstart < send: |
| 53 | + match_start = sstart + start_diff |
| 54 | + match_end = send - end_diff |
| 55 | + elif sstart > send: |
| 56 | + match_start = send + end_diff |
| 57 | + match_end = sstart - start_diff |
| 58 | + |
| 59 | + match_found = False |
| 60 | + if str(match_start) not in divided[sseqid].keys(): |
| 61 | + for start_fix in divided[sseqid].keys(): |
| 62 | + if int(start_fix) - 50 <= int(match_start) <= int(start_fix) + 50 : |
| 63 | + match_start = start_fix |
| 64 | + if divided[sseqid][start_fix] - 50 <= match_end <= divided[sseqid][start_fix] + 50 : |
| 65 | + match_end = divided[sseqid][start_fix] |
| 66 | + match_found = True |
| 67 | + else : |
| 68 | + print("no match was found for 2 : " + line) |
| 69 | + else: |
| 70 | + if divided[sseqid][str(match_start)] - 50 <= match_end <= divided[sseqid][str(match_start)] + 50 : |
| 71 | + match_end = divided[sseqid][str(match_start)] |
| 72 | + match_found = True |
| 73 | + |
| 74 | + saved_line2 = sseqid + ":" + str(match_start) + ":" + str(match_end) |
| 75 | + |
| 76 | + if match_found : |
| 77 | + new_group = True |
| 78 | + for saved in group.keys(): |
| 79 | + if saved_line in group[saved]['list']: |
| 80 | + new_group = False |
| 81 | + if (saved_line2 not in group[saved]['list']): |
| 82 | + group[saved]['list'].append(saved_line2) |
| 83 | + elif saved_line2 in group[saved]['list']: |
| 84 | + new_group = False |
| 85 | + if saved_line not in group[saved]['list']: |
| 86 | + group[saved]['list'].append(saved_line) |
| 87 | + |
| 88 | + if new_group: |
| 89 | + i += 1 |
| 90 | + group[str(i)] = { 'list': [saved_line, saved_line2], "length" : match_length } |
| 91 | + lengths.append(match_length) |
71 | 92 | del group["0"] |
72 | 93 |
|
73 | 94 | # adjusting duplicate groups |
|
0 commit comments