This repository was archived by the owner on Dec 13, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
86 lines (73 loc) · 2.05 KB
/
main.py
File metadata and controls
86 lines (73 loc) · 2.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
from seqfold import fold, dot_bracket
import os
import sys
sys.setrecursionlimit(1000000)
# load the reference genome sequence file
ref_genome = open("data/GRCh38_latest_genomic.fna", "r")
# get next item from the FASTA file
def get_next_item(file):
# read the first line
line = file.readline()
# check if the line is empty
if not line:
return None
# get the sequence id
seq_id = line[1:].strip()
# read the sequence
seq = ""
while True:
line = file.readline()
if not line or line.startswith(">"):
break
seq += line.strip()
# return seq_id, seq.upper()
return seq.upper()
# get all sequences starting at 'CGCGCG' and ending at 'AATAA'
def get_seqs(seq):
seqs = []
start = 0
while True:
start = seq.find("CGCGCG", start)
if start == -1:
break
end = seq.find("AATAA", start)
if end == -1:
break
seqs.append(seq[start:end+5])
start = end + 5
return seqs
def to_rna(dna):
rna = ""
for i in dna:
if i == 'G':
rna += 'C'
elif i == 'C':
rna += 'G'
elif i == 'T':
rna += 'A'
elif i == 'A':
rna += 'U'
return rna
# get all transcripts
transcripts = []
seq = get_next_item(ref_genome)
while seq:
transcripts.extend(get_seqs(seq))
seq = get_next_item(ref_genome)
# convert all transcripts to RNA
rna_transcripts = [to_rna(transcript) for transcript in transcripts]
i = 0
for transcript in rna_transcripts:
i += 1
# skip if folded/{i} already exists
if os.path.exists(f"folded/{i}"):
continue
# create a directory folded/{i}
os.makedirs(f"folded/{i}", exist_ok=True)
print(f"Folding transcript {i}")
folded_transcript = fold(transcript)
# save to file folded/{i}/fold
with open(f"folded/{i}/fold", "w") as f:
f.write("\n".join([str(x) for x in folded_transcript]))
with open(f"folded/{i}/seq", "w") as f:
f.write(transcript)