orthoquoll/msa.py at master · fethalen/orthoquoll · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
"Data type for storing a set of amino acid or nucleotide sequences."

from __future__ import absolute_import
from __future__ import print_function
import re
from sequence import Sequence

class MultipleSequenceAlignment(object):
    "Represents a set of sequences."
    def __init__(self, filename="", extension=None):
        self._filename = filename
        self._extension = extension
        self._sequences = []

    def __str__(self):
        return self.filename

    def __len__(self):
        return len(self.sequences)

    def __nonzero__(self):
        return True

    def __bool__(self):
        return True

    @property
    def filename(self):
        """
        The name of the file to which this multiple sequence alignment belongs.
        """
        return self._filename

    @filename.setter
    def filename(self, value):
        self._filename = value

    @property
    def extension(self):
        "The file extension used for this multiple sequence alignment."
        return self._extension

    @extension.setter
    def extension(self, value):
        self._extension = value

    @property
    def sequences(self):
        "Returns a list of all sequences in this alignment."
        return self._sequences

    @sequences.setter
    def sequences(self, value):
        self._sequences = value

    def add_sequence(self, sequence=None, description="", sequence_data=""):
        "Add a sequence object to the sequences list in this alignment object."
        if not sequence:
            sequence = Sequence()
            sequence.description = description
            sequence.sequence_data = sequence_data
            sequence.otu = re.split(r"\||@", sequence.description)[0]
            sequence.identifier = re.split(r"\||@", sequence.description)[1]
        elif description:
            sequence.otu = re.split(r"\||@", sequence.description)[0]
        elif sequence_data:
            sequence.identifier = re.split(r"\||@", sequence.description)[1]

        self.sequences.append(sequence)
        return sequence

    def get_sequence(self, description):
        """
        Takes a FASTA description as an input and returns the matching sequence
        object, if a sequence with that description is found within this
        alignment.
        """
        for sequence in self.sequences:
            if description == sequence.description:
                return sequence

    def iter_descriptions(self):
        """
        Returns an iterator object that includes all sequence descriptions in
        this alignment.
        """
        for sequence in self.sequences:
            yield sequence.description

    def iter_otus(self):
        "Returns an iterator object that includes all OTUs in this alignment."
        for sequence in self.sequences:
            yield sequence.otu

    def iter_identifiers(self):
        "Returns an iterator object that includes all IDs in this alignment."
        for sequence in self.sequences:
            yield sequence.identifier

    def gaps(self):
        """Returns the number of gap characters ('-', '?', or 'x') within this
        MultipleSequenceAlignment object.

        Returns
        -------
        gaps : int
            The number of gap characters within this MultipleSequenceAlignment
            object.
        """
        gaps = 0

        for sequence in self.sequences:
            gaps += self.alignment_len() - len(sequence.ungapped())

        return gaps

    def missing_data(self, otus_missing=0):
        """Returns the percent missing data within this
        MultipleSequenceAlignment object. The amount of missing data is
        calculated as follows: Sum the number of gap characters ('-', '?', or
        'x') within each alignment and then divide this number by the total
        number of positions within all alignments. Treat each OTU that is missing
        from the alignment (set by the user; DEFAULT: 0), as having gaps
        equivalent to the entire length of this MultipleSequenceAlignment
        object.

        Parameters
        ----------
        otus_missing : int
            The number of OTUs missing from this alignment (used to calculate
            missing data within a supermatrix; 0 by default).

        Returns
        -------
        missing_data : float
            The percent missing data within this alignment.
        """
        gaps = 0
        no_of_sequences = float(len(self))
        alignment_len = self.alignment_len()
        gaps += self.gaps()
        gaps += otus_missing * alignment_len

        if no_of_sequences > 0:
            return float(gaps) / (float(alignment_len) *
                                  float(no_of_sequences + otus_missing))
        else:
            return 0

    def otus(self):
        "Returns a list of the OTUs present within this alignment."
        otus_in_alignment = set()
        for sequence in self.sequences:
            if not sequence.otu in otus_in_alignment:
                otus_in_alignment.add(sequence.otu)
        return otus_in_alignment

    def alignment_len(self):
        """
        Returns the length of this alignment.
        """
        # first_position = 0
        # last_position = 0
        # for seq in self.sequences:

        #     if not seq.sequence_data.startswith("-"):
        #         gaps_start = "-"
        #         while seq.sequence_data.startswith(gaps_start):
        #             gaps_start += "-"
        #         if not first_position or first_position > len(gaps_start) + 1:
        #             print("first: " + gaps_start)
        #             first_position = len(gaps_start) + 1
        #     else:
        #         gaps_start = ""

        #     if seq.sequence_data.endswith("-"):
        #         gaps_stop = "-"
        #         while seq.sequence_data.endswith(gaps_stop):
        #             gaps_stop += "-"
        #         if not last_position or last_position > len(gaps_stop) + 1:
        #             print("last: " + gaps_stop)
        #             last_position = len(gaps_stop) + 1
        #     else:
        #         gaps_stop = ""

        # print(len(self.sequences[0]))
        # print(len(self.sequences[0]) - (first_position + last_position))
        return len(self.sequences[0])