Ram_Y_complex/sequence_reversal.py at master · gwlilabmit/Ram_Y_complex · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
from string_reversal import *
from base_complement import *

print "This script will only slice windows of a user-specified size in the genome file provided--if you need to reverse complement or anything like that, run that before running this script."
print "For now it'll get you windows symmetric around the cleavage site. I'll build in more options to get different types of windows as needed later."

#middle = int(raw_input("At what index [you should be able to find this on Mochiview] is the cleavage site? \n"))

###I'm getting some weird index offsets between what I'm seeing in mochiview and the file itself...bear with me
genetype = raw_input("What cleavage site is this? \n")
genetype = genetype.lower()

#bear with me here
#finding the index in NC_000964 for each cleavage site

if 'cggr' in genetype:
	middle = 3482769
	offset = 49756 #why the weird offsets? I'm basing this off what I see in Mochiview...if I don't do this my window is not at the indices I'd expect
	middle += offset
elif 'atpi' in genetype:
	middle = 3783696
	offset = 49756+8237+127
	middle += offset

elif 'cwlo' in genetype:
	middle = 3575886
	offset = 49756+8237+127-6925-98
	middle += offset

elif 'daca' in genetype:
	middle = 17476
	offset = 263
	middle += offset

elif 'ddl' in genetype:
	middle = 508162
	offset = 7169+103
	middle += offset

elif 'glna' in genetype:
	middle = 1878263
	offset = 2650+23853+341
	middle += offset

elif 'mena' in genetype:
	middle = 3951710
	offset = 55669+796
	middle += offset

#defining window size
windowsize = int(raw_input("How big should be the window be?\n"))

if windowsize%2==1:
	print "Your window size is odd so I'm going to assume you want two even-sized windows flanking the cleavage site."
	windowsize -=1
else:
	print "Your window size is even so I'm going to assume you want two even-sized windows flanking the cleavage site."

flanking = windowsize/2

start = middle - flanking
if start<0:
	print "Your window size is out of bounds so I'm going to start it at the beginning of the file."
if windowsize%2==0:
	end = middle+flanking+2 #adding the +1 because of how Python slices arrays


#open genome file and extract window

genomefile = "../B_subtilis_NC_000964.txt"  #edit path as needed
#genomefile = "hello.txt" #use for debugging purposes
genome = open(genomefile, 'r')
genomelist = []
for line in genome:
	if '>' in line:
		pass
	for char in line:
		genomelist.append(char)
if end>len(genomelist):
	print "Your window size is out of bounds so I'm going to end it at the end of the file."
	end = len(genomelist)
genome.close()
window = genomelist[start:end]
endwindow = "".join(window)
finalwindow = endwindow.rstrip('\n')

#reverse complement if the transcript is on the minus strand
#either way, print out the window here
strandedness = raw_input("Is this on the + strand?\n")
if 'n' in strandedness or 'N' in strandedness:
	reverse = reversal(finalwindow)
	comp = seqcomplement(reverse, 'DNA')
	print "Here's the window for the minus strand: \n"
	print comp
else:
	print "Here's the window for the plus strand: \n"
	print finalwindow