Skip to content

Commit 24d60c4

Browse files
authored
Improve support for multiple MSA formats (#44)
This enhances support for BioStockholm and adds support for aligned FASTA as MSA formats. It also adds a few enhancements for structures encoded in multiple formats (e.g., PDB or CIF) and supports flexibility in the return format from querying the EBI Proteins API (e.g., to download files in FASTA format).
1 parent 81144bb commit 24d60c4

14 files changed

+257
-97
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
name = "GPCRAnalysis"
22
uuid = "c1d73f9e-d42a-418a-8d5b-c7b00ec0358f"
3-
version = "0.6.0"
3+
version = "0.6.1"
44
authors = ["Tim Holy <[email protected]> and contributors"]
55

66
[deps]

ext/GPCRAnalysisBioStockholmExt.jl

Lines changed: 33 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,45 @@ using BioStockholm
55
using BioStockholm: OrderedDict # from OrderedCollections.jl
66

77
function conscols(msa::MSA)
8-
ss = msa.GC["SS_cons"]
9-
return findfirst(!=( '.'), ss):findlast(!=( '.'), ss)
8+
if length(msa.GR) == 1
9+
# Fast-path: use the reference sequence
10+
key, _ = only(msa.GR)
11+
s = msa.seq[key]
12+
return findall(s) do c
13+
c == '-' || isuppercase(c)
14+
end
15+
end
16+
# Slow path: check each sequence, find all that have at least one uppercase in that column
17+
keep = falses(length(msa.GC["seq_cons"]))
18+
for (_, s) in msa.seq
19+
keep .|= isuppercase.(s)
20+
end
21+
return findall(keep)
1022
end
1123

1224
# Low-level API implementation
13-
# GPCRAnalysis.sequenceindexes(msaseq::AnnotatedAlignedSequence) = getsequencemapping(msaseq)
14-
# GPCRAnalysis.sequenceindexes(msaseq::MSA, i::Int) = getsequencemapping(msaseq, i)
25+
GPCRAnalysis.sequenceindexes(msa::MSA, i::Int) = GPCRAnalysis.sequenceindexes(msa::MSA, MSACode(GPCRAnalysis.sequencekeys(msa)[i]))
26+
function GPCRAnalysis.sequenceindexes(msa::MSA, key::MSACode)
27+
# seq = GPCRAnalysis.msasequence(msa, key)
28+
seq = msa.seq[String(key)]
29+
offset = findfirst(!=('.'), seq)
30+
filled = [r != '-' for r in seq]
31+
cf = cumsum(filled)
32+
keepcols = conscols(msa)
33+
m = match(r"/(\d+)-(\d+)$", String(key))
34+
if m !== nothing
35+
start, stop = parse.(Int, m.captures)
36+
Δ = start - offset
37+
return (filled .* (cf .+ Δ))[keepcols]
38+
end
39+
return (filled .* cf)[keepcols]
40+
end
1541
GPCRAnalysis.sequencekeys(msa::MSA) = collect(keys(msa.seq))
16-
GPCRAnalysis.msasequence(msa::MSA, key) = msa.seq[key][conscols(msa)]
42+
GPCRAnalysis.msasequence(msa::MSA, key::MSACode) = msa.seq[String(key)][conscols(msa)]
43+
GPCRAnalysis.msasequence(msa::MSA, key::AbstractString) = GPCRAnalysis.msasequence(msa, MSACode(key))
1744
function GPCRAnalysis.residuematrix(msa::MSA)
1845
keepcols = conscols(msa)
46+
# keepcols = Colon()
1947
reduce(vcat, [permutedims(seq[keepcols]) for (_, seq) in msa.seq])
2048
end
2149
GPCRAnalysis.subseqs(msa::MSA{T}, rowmask::AbstractVector{Bool}) where T = MSA{T}(OrderedDict(pr for (pr, keep) in zip(msa.seq, rowmask) if keep), msa.GF, OrderedDict(pr for (pr, keep) in zip(msa.GS, rowmask) if keep), msa.GC, msa.GR)
@@ -48,52 +76,4 @@ end
4876
GPCRAnalysis.MSACode(msa::MSA, accession::AccessionCode) = MSACode(msa, accession.name)
4977
GPCRAnalysis.MSACode(::MSA, accession::MSACode) = accession
5078

51-
52-
function reduced_alphabet(r::Char)
53-
if r == '-'
54-
return 0
55-
elseif r in ('A','I','L','M','V')
56-
return 1 # hydrophobic
57-
elseif r in ('N','Q','S','T')
58-
return 2 # polar
59-
elseif r in ('R','H','K')
60-
return 3 # charged
61-
elseif r in ('D','E')
62-
return 4 # charged
63-
elseif r in ('F','W','Y')
64-
return 5 # aromatic
65-
end
66-
offset = findfirst(==(r), ('C','G','P'))
67-
offset === nothing && throw(ArgumentError("Unknown residue '$r'"))
68-
return 5 + offset # special or unknown
69-
end
70-
71-
GPCRAnalysis.columnwise_entropy(msa) = columnwise_entropy(reduced_alphabet, msa)
72-
73-
function GPCRAnalysis.percent_similarity(f, msa::MSA)
74-
# This mimics MIToS's implementation
75-
function pctsim(v1, v2)
76-
same = l = 0
77-
for (a, b) in zip(v1, v2)
78-
a == b == 0 && continue # skip gaps
79-
same += a == b
80-
l += 1
81-
end
82-
return 100 * same / l
83-
end
84-
85-
M = f.(GPCRAnalysis.residuematrix(msa))
86-
n = size(M, 1)
87-
S = zeros(Float64, n, n)
88-
for i in 1:n
89-
for j in i:n
90-
S[i, j] = pctsim(M[i, :], M[j, :])
91-
S[j, i] = S[i, j]
92-
end
93-
end
94-
return S
95-
end
96-
GPCRAnalysis.percent_similarity(msa::MSA) = GPCRAnalysis.percent_similarity(reduced_alphabet, msa)
97-
98-
9979
end

ext/GPCRAnalysisMIToSExt.jl

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,14 @@ using MIToS.MSA: getsequence, getannotsequence, getsequencemapping, getresidues,
1616

1717
# Low-level API implementation
1818
GPCRAnalysis.sequenceindexes(msaseq::AnnotatedAlignedSequence) = getsequencemapping(msaseq)
19-
GPCRAnalysis.sequenceindexes(msaseq::AbstractMultipleSequenceAlignment, i::Int) = getsequencemapping(msaseq, i)
19+
GPCRAnalysis.sequenceindexes(msa::AbstractMultipleSequenceAlignment, i::Int) = getsequencemapping(msa, i)
20+
GPCRAnalysis.sequenceindexes(msa::AbstractMultipleSequenceAlignment, key::AbstractString) = getsequencemapping(msa, key)
21+
GPCRAnalysis.sequenceindexes(msa::AbstractMultipleSequenceAlignment, key::MSACode) = sequenceindexes(msa, String(key))
2022
GPCRAnalysis.isgap(res::MSA.Residue) = res == GAP
2123
GPCRAnalysis.isunknown(res::MSA.Residue) = res == XAA
2224
GPCRAnalysis.sequencekeys(msa::AbstractMultipleSequenceAlignment) = sequencenames(msa)
23-
GPCRAnalysis.msasequence(msa::AbstractMultipleSequenceAlignment, key) = getsequence(msa, key)
25+
GPCRAnalysis.msasequence(msa::AbstractMultipleSequenceAlignment, key::AbstractString) = getsequence(msa, key)
26+
GPCRAnalysis.msasequence(msa::AbstractMultipleSequenceAlignment, key::MSACode) = msasequence(msa, String(key))
2427
GPCRAnalysis.residuematrix(msa::AbstractMultipleSequenceAlignment) = getresidues(msa)
2528
GPCRAnalysis.subseqs(msa::AbstractMultipleSequenceAlignment, rowmask) = filtersequences(msa, rowmask)
2629
GPCRAnalysis.subseqs!(msa::AbstractMultipleSequenceAlignment, rowmask) = filtersequences!(msa, rowmask)

src/GPCRAnalysis.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,10 @@ const StructureLike = Union{ChainLike, Model, MolecularStructure}
3737
# export @res_str
3838

3939
export SequenceMapping, AccessionCode, MSACode, NWGapCosts
40+
export sequenceindexes, columnindexes, isgap, isunknown, sequencekeys, msasequence, residuematrix, subseqs, subseqs!
4041
export species, uniprotX, query_uniprot_accession, query_ebi_proteins, query_ncbi
4142
export try_download_alphafold, query_alphafold_latest, download_alphafolds, alphafoldfile, alphafoldfiles, getchain,
42-
findall_subseq, pLDDT, pLDDTcolor
43+
writechain, findall_subseq, pLDDT, pLDDTcolor
4344
export align_to_axes, align_to_membrane, align_nw, align_ranges, map_closest, align_closest
4445
export filter_species!, filter_long!, sortperm_msa, chimerax_script
4546
export project_sequences, columnwise_entropy, align, residue_centroid, residue_centroid_matrix, alphacarbon_coordinates,

src/alphafold.jl

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Generates 2 captures, one for the uniprotXname and the other for the version
2-
const rex_alphafold_pdbs = r"AF-([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})-F1-model_v(\d+).(?:pdb|cif|bcif)"
2+
const rex_alphafold_pdbs = r"AF-([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})-F1-model_v(\d+)(?:_[A-Z])?(?:\.(?:pdb|cif|bcif))?"
33
# Make a regex for a specific uniprotXname (single capture for the version)
44
regex_alphafold_pdb(uniprotXname) = Regex("AF-$uniprotXname-F1-model_v(\\d+).(?:pdb|cif|bcif)")
55

@@ -62,12 +62,14 @@ function alphafoldfiles(msa, dirname::AbstractString=pwd(); join::Bool=false)
6262
ac = AccessionCode(match(rex_alphafold_pdbs, af).captures[1])
6363
accesscode2idx[ac] = i
6464
end
65-
msacode2structfile = Dict{MSACode,String}()
65+
K = typeof(first(sequencekeys(msa)))
66+
msacode2structfile = Dict{K<:AbstractString ? MSACode : K,String}()
6667
for name in sequencekeys(msa)
6768
ac = AccessionCode(msa, name)
6869
if haskey(accesscode2idx, ac)
6970
fn = afs[accesscode2idx[ac]]
70-
msacode2structfile[MSACode(name)] = join ? joinpath(dirname, fn) : fn
71+
mc = isa(name, AbstractString) ? MSACode(name) : name
72+
msacode2structfile[mc] = join ? joinpath(dirname, fn) : fn
7173
end
7274
end
7375
return msacode2structfile
@@ -130,23 +132,23 @@ function download_alphafolds(msa; dirname=pwd(), maxversion=nothing, kwargs...)
130132
maxversion === nothing || @warn "`download_alphafolds`: `maxversion` kwarg has no effect and is deprecated" maxlog=1
131133
@showprogress 1 "Downloading AlphaFold files..." for name in sequencekeys(msa)
132134
uname = AccessionCode(msa, name)
133-
url = query_alphafold_latest(uname)
135+
url = query_alphafold_latest(uname; kwargs...)
134136
url === nothing && continue
135137
fn = split(url, '/')[end]
136138
path = joinpath(dirname, fn)
137139
if !isfile(path)
138140
Downloads.download(url, path)
139141
end
140142
if !validate_seq_residues(msasequence(msa, name), getchain(path))
141-
@warn "Residues in $path do not match those in the sequence $name, removing PDB file"
143+
@warn "Residues in $path do not match those in the sequence $name, removing structure file"
142144
rm(path)
143145
end
144146
end
145147
end
146148

147149
function download_alphafolds(ids::AbstractVector{<:AbstractString}; dirname=pwd(), kwargs...)
148150
@showprogress 1 "Downloading AlphaFold files..." for uname in ids
149-
url = query_alphafold_latest(uname)
151+
url = query_alphafold_latest(uname; kwargs...)
150152
url === nothing && continue
151153
fn = split(url, '/')[end]
152154
path = joinpath(dirname, fn)

src/analyze.jl

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,6 @@ end
3131
columnwise_entropy(f, msa)
3232
3333
Compute the entropy of each column in an MSA, after applying `f` to each residue. Low entropy indicates high conservation.
34-
35-
Unmatched entries (`'-'` residues) contribute to the entropy calculation as if they were an ordinary residue.
3634
"""
3735
function columnwise_entropy(f, msa)
3836
resnum = map(f, residuematrix(msa))

src/msa.jl

Lines changed: 107 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,42 @@ Return the corresponding index within the full sequence for each position in `ms
1313
The two-argument form retrieves the sequenceindexes for the `i`th sequence in `msa`.
1414
"""
1515
function sequenceindexes end
16+
sequenceindexes(msa::AbstractVector{FASTX.FASTA.Record}, i::Int) = sequenceindexes(msa[i], columnindexes(msa))
17+
function sequenceindexes(seq::FASTX.FASTA.Record, keepcols::AbstractVector{Int})
18+
s = collect(sequence(seq))
19+
idx = findfirst(islowercase, s)
20+
offset = idx === nothing ? first(keepcols) : idx
21+
preambleidx = first(keepcols)
22+
filled = map(eachindex(s)) do j
23+
j < preambleidx || isuppercase(s[j])
24+
end
25+
cf = cumsum(filled)
26+
m = match(r"/(\d+)-(\d+)$", identifier(seq))
27+
if m !== nothing
28+
start, stop = parse.(Int, m.captures)
29+
Δ = start - offset
30+
return (filled .* (cf .+ Δ))[keepcols]
31+
end
32+
return (filled .* cf)[keepcols]
33+
end
1634

1735
"""
1836
idxs = columnindexes(msa)
1937
20-
Return the indices (within the reference sequence) covered by the conserved columns of the MSA.
38+
Return the indices of the conserved columns of the MSA.
2139
"""
2240
function columnindexes end
41+
function columnindexes(msa::AbstractVector{FASTX.FASTA.Record})
42+
# Slow path: check each sequence, find all that have at least one uppercase in that column
43+
nseq = length(msa)
44+
ncol = length(sequence(msa[1]))
45+
keep = falses(ncol)
46+
for rec in msa
47+
s = collect(sequence(rec))
48+
keep .|= isuppercase.(s)
49+
end
50+
return findall(keep)
51+
end
2352

2453
"""
2554
isgap(res)
@@ -28,6 +57,7 @@ Return `true` if the residue `res` is a gap.
2857
"""
2958
function isgap end
3059
isgap(c::Char) = c == '-'
60+
isgap(r::Integer) = iszero(r) # when mapped to integers, gaps are encoded as 0
3161

3262
"""
3363
isunknown(res)
@@ -43,20 +73,26 @@ isunknown(c::Char) = c == 'X'
4373
Return the keys (sequence names) of the MSA.
4474
"""
4575
function sequencekeys end
76+
sequencekeys(msa::AbstractVector{FASTX.FASTA.Record}) = eachindex(msa)
4677

4778
"""
4879
seq = msasequence(msa, key)
4980
5081
Return the aligned sequence corresponding to `key`.
5182
"""
5283
function msasequence end
84+
msasequence(msa::AbstractVector{FASTX.FASTA.Record}, key::Int) = sequence(msa[key])
5385

5486
"""
5587
R = residuematrix(msa)
5688
5789
Get all residues in the MSA as a matrix, one sequence per row.
5890
"""
5991
function residuematrix end
92+
function residuematrix(msa::AbstractVector{FASTX.FASTA.Record})
93+
M = reduce(vcat, [permutedims(collect(sequence(rec))) for rec in msa])
94+
return M[:, columnindexes(msa)]
95+
end
6096

6197
"""
6298
msaview = subseqs(msa, rowindexes::AbstractVector{Int})
@@ -70,14 +106,80 @@ Construct a reduced-size `msaview`, keeping only the sequences corresponding to
70106
function subseqs end
71107
function subseqs! end
72108

109+
subseqs(msa::AbstractVector{FASTX.FASTA.Record}, rowmask) = msa[rowmask]
110+
subseqs!(msa::AbstractVector{FASTX.FASTA.Record}, rowmask::AbstractVector{Bool}) =
111+
deleteat!(msa, findall(!, rowmask))
112+
subseqs!(msa::AbstractVector{FASTX.FASTA.Record}, rowindexes::AbstractVector{Int}) =
113+
deleteat!(msa, setdiff(1:length(msa), rowindexes))
114+
115+
## End required API, but some can specialize other methods
116+
73117
"""
74118
pc = percent_similarity(msa)
119+
pc = percent_similarity(f, msa)
75120
76121
Compute the percent similarity between all pairs of sequences in `msa`.
77122
`pc[i, j]` is the percent similarity between sequences `i` and `j`.
123+
124+
Optionally apply mapping function `f` to each residue before computing
125+
similarity.
78126
"""
79127
function percent_similarity end
80128

129+
function percent_similarity(f, msa)
130+
# This mimics MIToS's implementation
131+
function pctsim(v1, v2)
132+
same = l = 0
133+
for (a, b) in zip(v1, v2)
134+
isgap(a) && isgap(b) && continue # skip gaps
135+
same += a == b
136+
l += 1
137+
end
138+
return 100 * same / l
139+
end
140+
141+
M = f.(residuematrix(msa))
142+
n = size(M, 1)
143+
S = zeros(Float64, n, n)
144+
for i in 1:n
145+
for j in i:n
146+
S[i, j] = pctsim(M[i, :], M[j, :])
147+
S[j, i] = S[i, j]
148+
end
149+
end
150+
return S
151+
end
152+
percent_similarity(msa) = percent_similarity(reduced_alphabet, msa)
153+
154+
function reduced_alphabet(r::Char)
155+
if r == '-'
156+
return 0
157+
elseif r in ('A','I','L','M','V')
158+
return 1 # hydrophobic
159+
elseif r in ('N','Q','S','T')
160+
return 2 # polar
161+
elseif r in ('R','H','K')
162+
return 3 # charged
163+
elseif r in ('D','E')
164+
return 4 # charged
165+
elseif r in ('F','W','Y')
166+
return 5 # aromatic
167+
end
168+
offset = findfirst(==(r), ('C','G','P'))
169+
offset === nothing && throw(ArgumentError("Unknown residue '$r'"))
170+
return 5 + offset # special or unknown
171+
end
172+
173+
columnwise_entropy(msa) = columnwise_entropy(reduced_alphabet, msa)
174+
175+
# Notes on interpreting letter codes in the "GC.seq_cons" field:
176+
# - `.` indicates a gap
177+
# - uppercase single-letter amino acid codes indicate strong consensus (>60%)
178+
# - lowercase single-letter codes (likely interpretations):
179+
# + 'a': aromatic
180+
# + 'h': hydrophobic
181+
# + rest unknown
182+
# - '+' and '-' indicate positively- and negatively-charged residues, respectively
81183

82184
## MSA functions
83185

@@ -107,6 +209,10 @@ function filter_species!(msa, speciesname::AbstractString)
107209
mask = map(x -> species(x) == speciesname, sequencekeys(msa))
108210
subseqs!(msa, mask)
109211
end
212+
function filter_species!(msa::AbstractVector{FASTX.FASTA.Record}, speciesname::AbstractString)
213+
mask = map(x -> species(x) == speciesname, identifier.(msa))
214+
subseqs!(msa, mask)
215+
end
110216

111217
"""
112218
filter_long!(msa, minres::Real)

0 commit comments

Comments
 (0)