@@ -13,13 +13,42 @@ Return the corresponding index within the full sequence for each position in `ms
1313The two-argument form retrieves the sequenceindexes for the `i`th sequence in `msa`.
1414"""
1515function sequenceindexes end
function sequenceindexes(msa::AbstractVector{FASTX.FASTA.Record}, i::Int)
    # Two-argument form: pick the `i`th record, then delegate to the
    # per-record method using the alignment-wide conserved columns.
    record = msa[i]
    return sequenceindexes(record, columnindexes(msa))
end
function sequenceindexes(seq::FASTX.FASTA.Record, keepcols::AbstractVector{Int})
    residues = collect(sequence(seq))
    firstlower = findfirst(islowercase, residues)
    firstkept = first(keepcols)
    # Numbering offset: position of the first lowercase residue, or the first
    # kept column when the record contains no lowercase residue.
    offset = firstlower === nothing ? firstkept : firstlower
    # A position counts toward the running index if it precedes the first kept
    # column or holds an uppercase residue.
    occupied = [j < firstkept || isuppercase(residues[j]) for j in eachindex(residues)]
    counts = cumsum(occupied)
    # Identifiers ending in "/start-stop" carry the starting residue number.
    rangematch = match(r"/(\d+)-(\d+)$", identifier(seq))
    if rangematch === nothing
        # Unoccupied positions are reported as 0.
        return (occupied .* counts)[keepcols]
    end
    start, _ = parse.(Int, rangematch.captures)
    shift = start - offset
    return (occupied .* (counts .+ shift))[keepcols]
end
1634
1735"""
1836 idxs = columnindexes(msa)
1937
20- Return the indices (within the reference sequence) covered by the conserved columns of the MSA.
38+ Return the indices of the conserved columns of the MSA.
2139"""
2240function columnindexes end
function columnindexes(msa::AbstractVector{FASTX.FASTA.Record})
    # An empty alignment has no columns; return early instead of indexing
    # the (nonexistent) first record below.
    isempty(msa) && return Int[]
    # Slow path: scan every sequence and keep each column in which at least
    # one sequence has an uppercase character.
    ncol = length(sequence(first(msa)))
    keep = falses(ncol)
    for rec in msa
        keep .|= isuppercase.(collect(sequence(rec)))
    end
    return findall(keep)
end
2352
2453"""
2554 isgap(res)
@@ -28,6 +57,7 @@ Return `true` if the residue `res` is a gap.
2857"""
2958function isgap end
function isgap(c::Char)
    # Character-encoded residues use '-' for a gap.
    return c == '-'
end
function isgap(r::Integer)
    # When residues are mapped to integers, gaps are encoded as 0.
    return iszero(r)
end
3161
3262"""
3363 isunknown(res)
@@ -43,20 +73,26 @@ isunknown(c::Char) = c == 'X'
4373Return the keys (sequence names) of the MSA.
4474"""
4575function sequencekeys end
function sequencekeys(msa::AbstractVector{FASTX.FASTA.Record})
    # For a vector of records the keys are the vector's own indices.
    return eachindex(msa)
end
4677
4778"""
4879 seq = msasequence(msa, key)
4980
5081Return the aligned sequence corresponding to `key`.
5182"""
5283function msasequence end
function msasequence(msa::AbstractVector{FASTX.FASTA.Record}, key::Int)
    # `key` is a position in `msa`.
    record = msa[key]
    return sequence(record)
end
5385
5486"""
5587 R = residuematrix(msa)
5688
5789Get all residues in the MSA as a matrix, one sequence per row.
5890"""
5991function residuematrix end
function residuematrix(msa::AbstractVector{FASTX.FASTA.Record})
    # One 1×ncol row per record, stacked vertically into an nseq×ncol matrix.
    rows = [permutedims(collect(sequence(rec))) for rec in msa]
    allcols = reduce(vcat, rows)
    # Restrict to the columns reported by `columnindexes`.
    conserved = columnindexes(msa)
    return allcols[:, conserved]
end
6096
6197"""
6298 msaview = subseqs(msa, rowindexes::AbstractVector{Int})
@@ -70,14 +106,80 @@ Construct a reduced-size `msaview`, keeping only the sequences corresponding to
70106function subseqs end
71107function subseqs! end
72108
function subseqs(msa::AbstractVector{FASTX.FASTA.Record}, rowmask)
    # Non-mutating selection: `rowmask` is used directly as an index into `msa`.
    return getindex(msa, rowmask)
end
function subseqs!(msa::AbstractVector{FASTX.FASTA.Record}, rowmask::AbstractVector{Bool})
    # Reject a mask of the wrong length up front: a shorter mask would
    # otherwise leave the trailing sequences in place without any error.
    length(rowmask) == length(msa) || throw(DimensionMismatch(
        "rowmask has length $(length(rowmask)) but msa has $(length(msa)) sequences",
    ))
    # Delete every sequence whose mask entry is `false`; returns the mutated `msa`.
    return deleteat!(msa, findall(!, rowmask))
end
function subseqs!(msa::AbstractVector{FASTX.FASTA.Record}, rowindexes::AbstractVector{Int})
    # Every position not listed in `rowindexes` is removed in place.
    discard = setdiff(1:length(msa), rowindexes)
    return deleteat!(msa, discard)
end
114+
## End of the required API (implementations may additionally specialize other methods)
116+
73117"""
74118 pc = percent_similarity(msa)
119+ pc = percent_similarity(f, msa)
75120
76121Compute the percent similarity between all pairs of sequences in `msa`.
77122`pc[i, j]` is the percent similarity between sequences `i` and `j`.
123+
124+ Optionally apply mapping function `f` to each residue before computing
125+ similarity.
78126"""
79127function percent_similarity end
80128
function percent_similarity(f, msa)
    # This mimics MIToS's implementation.
    #
    # `pctsim` returns the percentage of compared positions at which `v1` and
    # `v2` hold equal values. Positions that are gaps in *both* inputs are
    # excluded; a gap paired with a residue is counted as a mismatch. If every
    # position is a double gap, nothing is compared and the result is NaN (0/0).
    function pctsim(v1, v2)
        same = l = 0
        for (a, b) in zip(v1, v2)
            isgap(a) && isgap(b) && continue  # skip positions that are gaps in both
            same += a == b
            l += 1
        end
        return 100 * same / l
    end

    # Map every residue through `f` once, up front.
    M = f.(residuematrix(msa))
    n = size(M, 1)
    S = zeros(Float64, n, n)
    for i in 1:n
        # Row views avoid copying two full rows for each of the O(n^2) pairs.
        rowi = view(M, i, :)
        for j in i:n
            # The measure is symmetric, so compute it once and mirror it.
            S[i, j] = S[j, i] = pctsim(rowi, view(M, j, :))
        end
    end
    return S
end
function percent_similarity(msa)
    # Default residue mapping: group residues with `reduced_alphabet`.
    return percent_similarity(reduced_alphabet, msa)
end
153+
function reduced_alphabet(r::Char)
    # Map a residue character to an integer group code; 0 is reserved for gaps.
    r == '-' && return 0
    r in "AILMV" && return 1  # hydrophobic
    r in "NQST" && return 2   # polar
    r in "RHK" && return 3    # charged (R/H/K group)
    r in "DE" && return 4     # charged (D/E group)
    r in "FWY" && return 5    # aromatic
    # C, G and P each get a code of their own (6, 7, 8).
    special = findfirst(==(r), "CGP")
    if special === nothing
        throw(ArgumentError("Unknown residue '$r'"))
    end
    return 5 + special
end
172+
function columnwise_entropy(msa)
    # Default residue mapping: group residues with `reduced_alphabet`.
    return columnwise_entropy(reduced_alphabet, msa)
end
174+
175+ # Notes on interpreting letter codes in the "GC.seq_cons" field:
176+ # - `.` indicates a gap
177+ # - uppercase single-letter amino acid codes indicate strong consensus (>60%)
178+ # - lowercase single-letter codes (likely interpretations):
179+ # + 'a': aromatic
180+ # + 'h': hydrophobic
181+ # + rest unknown
182+ # - '+' and '-' indicate positively- and negatively-charged residues, respectively
81183
82184# # MSA functions
83185
@@ -107,6 +209,10 @@ function filter_species!(msa, speciesname::AbstractString)
107209 mask = map (x -> species (x) == speciesname, sequencekeys (msa))
108210 subseqs! (msa, mask)
109211end
function filter_species!(
    msa::AbstractVector{FASTX.FASTA.Record}, speciesname::AbstractString
)
    # `sequencekeys` yields plain indices for a record vector, so the species
    # is looked up from each record's identifier instead.
    ids = identifier.(msa)
    keep = map(id -> species(id) == speciesname, ids)
    return subseqs!(msa, keep)
end
110216
111217"""
112218 filter_long!(msa, minres::Real)
0 commit comments