-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmsa.py
More file actions
188 lines (157 loc) · 6.13 KB
/
msa.py
File metadata and controls
188 lines (157 loc) · 6.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
"Data type for storing a set of amino acid or nucleotide sequences."
from __future__ import absolute_import
from __future__ import print_function
import re
from sequence import Sequence
class MultipleSequenceAlignment(object):
"Represents a set of sequences."
def __init__(self, filename="", extension=None):
self._filename = filename
self._extension = extension
self._sequences = []
def __str__(self):
return self.filename
def __len__(self):
return len(self.sequences)
def __nonzero__(self):
return True
def __bool__(self):
return True
@property
def filename(self):
"""
The name of the file to which this multiple sequence alignment belongs.
"""
return self._filename
@filename.setter
def filename(self, value):
self._filename = value
@property
def extension(self):
"The file extension used for this multiple sequence alignment."
return self._extension
@extension.setter
def extension(self, value):
self._extension = value
@property
def sequences(self):
"Returns a list of all sequences in this alignment."
return self._sequences
@sequences.setter
def sequences(self, value):
self._sequences = value
def add_sequence(self, sequence=None, description="", sequence_data=""):
"Add a sequence object to the sequences list in this alignment object."
if not sequence:
sequence = Sequence()
sequence.description = description
sequence.sequence_data = sequence_data
sequence.otu = re.split(r"\||@", sequence.description)[0]
sequence.identifier = re.split(r"\||@", sequence.description)[1]
elif description:
sequence.otu = re.split(r"\||@", sequence.description)[0]
elif sequence_data:
sequence.identifier = re.split(r"\||@", sequence.description)[1]
self.sequences.append(sequence)
return sequence
def get_sequence(self, description):
"""
Takes a FASTA description as an input and returns the matching sequence
object, if a sequence with that description is found within this
alignment.
"""
for sequence in self.sequences:
if description == sequence.description:
return sequence
def iter_descriptions(self):
"""
Returns an iterator object that includes all sequence descriptions in
this alignment.
"""
for sequence in self.sequences:
yield sequence.description
def iter_otus(self):
"Returns an iterator object that includes all OTUs in this alignment."
for sequence in self.sequences:
yield sequence.otu
def iter_identifiers(self):
"Returns an iterator object that includes all IDs in this alignment."
for sequence in self.sequences:
yield sequence.identifier
def gaps(self):
"""Returns the number of gap characters ('-', '?', or 'x') within this
MultipleSequenceAlignment object.
Returns
-------
gaps : int
The number of gap characters within this MultipleSequenceAlignment
object.
"""
gaps = 0
for sequence in self.sequences:
gaps += self.alignment_len() - len(sequence.ungapped())
return gaps
def missing_data(self, otus_missing=0):
"""Returns the percent missing data within this
MultipleSequenceAlignment object. The amount of missing data is
calculated as follows: Sum the number of gap characters ('-', '?', or
'x') within each alignment and then divide this number by the total
number of positions within all alignments. Treat each OTU that is missing
from the alignment (set by the user; DEFAULT: 0), as having gaps
equivalent to the entire length of this MultipleSequenceAlignment
object.
Parameters
----------
otus_missing : int
The number of OTUs missing from this alignment (used to calculate
missing data within a supermatrix; 0 by default).
Returns
-------
missing_data : float
The percent missing data within this alignment.
"""
gaps = 0
no_of_sequences = float(len(self))
alignment_len = self.alignment_len()
gaps += self.gaps()
gaps += otus_missing * alignment_len
if no_of_sequences > 0:
return float(gaps) / (float(alignment_len) *
float(no_of_sequences + otus_missing))
else:
return 0
def otus(self):
"Returns a list of the OTUs present within this alignment."
otus_in_alignment = set()
for sequence in self.sequences:
if not sequence.otu in otus_in_alignment:
otus_in_alignment.add(sequence.otu)
return otus_in_alignment
def alignment_len(self):
"""
Returns the length of this alignment.
"""
# first_position = 0
# last_position = 0
# for seq in self.sequences:
# if not seq.sequence_data.startswith("-"):
# gaps_start = "-"
# while seq.sequence_data.startswith(gaps_start):
# gaps_start += "-"
# if not first_position or first_position > len(gaps_start) + 1:
# print("first: " + gaps_start)
# first_position = len(gaps_start) + 1
# else:
# gaps_start = ""
# if seq.sequence_data.endswith("-"):
# gaps_stop = "-"
# while seq.sequence_data.endswith(gaps_stop):
# gaps_stop += "-"
# if not last_position or last_position > len(gaps_stop) + 1:
# print("last: " + gaps_stop)
# last_position = len(gaps_stop) + 1
# else:
# gaps_stop = ""
# print(len(self.sequences[0]))
# print(len(self.sequences[0]) - (first_position + last_position))
return len(self.sequences[0])