Skip to content

Commit 360ecaf

Browse files
committed
Move MIToS support to an extension
BioStockholm is much faster than MIToS at parsing large files, and it would be nice to be able to support it. As a first step, move MIToS support to an extension.
1 parent 3b3996f commit 360ecaf

File tree

15 files changed

+646
-201
lines changed

15 files changed

+646
-201
lines changed

Project.toml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ Hungarian = "e91730f6-4275-51fb-a7a0-7064cfbd3b39"
1717
Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
1818
JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
1919
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
20-
MIToS = "51bafb47-8a16-5ded-8b04-24ef4eede0b5"
2120
MultivariateStats = "6f286f6a-111f-5878-ab1e-185364afe411"
2221
MutableConvexHulls = "948c7aac-0e5e-4631-af23-7a6bb7a17825"
2322
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
@@ -30,9 +29,11 @@ TravelingSalesmanHeuristics = "8c8f4381-2cdd-507c-846c-be2bcff6f45f"
3029
[weakdeps]
3130
HiGHS = "87dc4568-4c63-4d18-b0c0-bb2238e4078b"
3231
JuMP = "4076af6c-e467-56ae-b986-b466b2749572"
32+
MIToS = "51bafb47-8a16-5ded-8b04-24ef4eede0b5"
3333

3434
[extensions]
3535
GPCRAnalysisJuMPExt = ["JuMP", "HiGHS"]
36+
GPCRAnalysisMIToSExt = "MIToS"
3637

3738
[compat]
3839
BioStructures = "4.2"
@@ -65,7 +66,8 @@ julia = "1.10"
6566
HiGHS = "87dc4568-4c63-4d18-b0c0-bb2238e4078b"
6667
InvertedIndices = "41ab1584-1d38-5bbf-9106-f11c6c58b48f"
6768
JuMP = "4076af6c-e467-56ae-b986-b466b2749572"
69+
MIToS = "51bafb47-8a16-5ded-8b04-24ef4eede0b5"
6870
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
6971

7072
[targets]
71-
test = ["HiGHS", "InvertedIndices", "JuMP", "Test"]
73+
test = ["HiGHS", "InvertedIndices", "JuMP", "MIToS", "Test"]

ext/GPCRAnalysisMIToSExt.jl

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
module GPCRAnalysisMIToSExt
2+
3+
using GPCRAnalysis
4+
using Downloads
5+
using BioStructures
6+
using ProgressMeter
7+
8+
using GPCRAnalysis: ChainLike, ResidueLike, StructureLike, _entropy, validate_seq_residues, rex_alphafold_pdbs
9+
10+
using MIToS: MIToS, Pfam, MSA
11+
using MIToS.MSA: AbstractMultipleSequenceAlignment, AnnotatedAlignedSequence, AnnotatedMultipleSequenceAlignment,
12+
ReducedAlphabet, ResidueAlphabet, GAP, XAA
13+
using MIToS.MSA: getsequence, getannotsequence, getsequencemapping, getresidues, three2residue, sequencenames,
14+
filtersequences, filtersequences!, percentsimilarity
15+
16+
17+
# Low-level API implementation
18+
GPCRAnalysis.sequenceindexes(msaseq::AnnotatedAlignedSequence) = getsequencemapping(msaseq)
19+
GPCRAnalysis.sequenceindexes(msaseq::AbstractMultipleSequenceAlignment, i::Int) = getsequencemapping(msaseq, i)
20+
GPCRAnalysis.isgap(res::MSA.Residue) = res == GAP
21+
GPCRAnalysis.isunknown(res::MSA.Residue) = res == XAA
22+
GPCRAnalysis.sequencekeys(msa::AbstractMultipleSequenceAlignment) = sequencenames(msa)
23+
GPCRAnalysis.msasequence(msa::AbstractMultipleSequenceAlignment, key) = getsequence(msa, key)
24+
GPCRAnalysis.residuematrix(msa::AbstractMultipleSequenceAlignment) = getresidues(msa)
25+
GPCRAnalysis.subseqs(msa::AbstractMultipleSequenceAlignment, rowmask::AbstractVector{Bool}) = filtersequences(msa, rowmask)
26+
GPCRAnalysis.subseqs!(msa::AbstractMultipleSequenceAlignment, rowmask::AbstractVector{Bool}) = filtersequences!(msa, rowmask)
27+
function GPCRAnalysis.subseqs(msa::AbstractMultipleSequenceAlignment, rowindexes::AbstractVector{Int})
28+
rowmask = falses(nsequences(msa))
29+
rowmask[rowindexes] .= true
30+
return subseqs(msa, rowmask)
31+
end
32+
function GPCRAnalysis.subseqs!(msa::AbstractMultipleSequenceAlignment, rowindexes::AbstractVector{Int})
33+
rowmask = falses(nsequences(msa))
34+
rowmask[rowindexes] .= true
35+
return subseqs!(msa, rowmask)
36+
end
37+
GPCRAnalysis.percent_similarity(msa::AbstractMultipleSequenceAlignment) = percentsimilarity(msa)
38+
39+
Base.getindex(msa::AbstractMultipleSequenceAlignment, seqname::MSACode) = getsequence(msa, seqname.name)
40+
Base.getindex(msa::AbstractMultipleSequenceAlignment, seqname::AccessionCode) = getsequence(msa, MSACode(msa, seqname).name)
41+
42+
function GPCRAnalysis.AccessionCode(msa::AnnotatedMultipleSequenceAlignment, seqname::AbstractString)
43+
AccessionCode(uniprotX(getannotsequence(msa, seqname, "AC", seqname)))
44+
end
45+
GPCRAnalysis.AccessionCode(msa::AnnotatedMultipleSequenceAlignment, seqname::MSACode) = AccessionCode(msa, seqname.name)
46+
GPCRAnalysis.AccessionCode(::AnnotatedMultipleSequenceAlignment, seqname::AccessionCode) = seqname
47+
48+
function GPCRAnalysis.MSACode(msa::AnnotatedMultipleSequenceAlignment, accession::AbstractString)
49+
seqnames = sequencenames(msa)
50+
return MSACode(seqnames[findfirst(x -> AccessionCode(msa, x).name == accession, seqnames)])
51+
end
52+
GPCRAnalysis.MSACode(msa::AnnotatedMultipleSequenceAlignment, accession::AccessionCode) = MSACode(msa, accession.name)
53+
GPCRAnalysis.MSACode(::AnnotatedMultipleSequenceAlignment, accession::MSACode) = accession
54+
55+
GPCRAnalysis.SequenceMapping(seq::AnnotatedAlignedSequence) = SequenceMapping(getsequencemapping(seq))
56+
57+
# Move this to MIToS?
58+
if !hasmethod(getsequencemapping, Tuple{AnnotatedAlignedSequence})
59+
function MIToS.MSA.getsequencemapping(seq::AnnotatedAlignedSequence)
60+
getsequencemapping(seq, sequencenames(seq)[1])
61+
end
62+
function MIToS.MSA.getsequencemapping(msa::Union{AnnotatedAlignedSequence,AnnotatedMultipleSequenceAlignment}, seq_id::String)
63+
MIToS.MSA._str2int_mapping(getannotsequence(msa, seq_id, "SeqMap"))
64+
end
65+
function MIToS.MSA.getsequencemapping(msa::AnnotatedMultipleSequenceAlignment, seqid::Regex)
66+
id = findfirst(str -> occursin(seqid, str), sequencenames(msa))
67+
getsequencemapping(msa, id)
68+
end
69+
end
70+
71+
const reduced_code = ReducedAlphabet("(AILMV)(NQST)(RHK)(DE)(FWY)CGP")
72+
73+
"""
74+
columnwise_entropy(msa, aacode = reduced_code)
75+
76+
Call `columnwise_entropy` after mapping each residue through `aacode`.
77+
78+
The default code is `ReducedAlphabet("(AILMV)(NQST)(RHK)(DE)(FWY)CGP")`, which
79+
groups residues into categories hydrophobic, polar, charged, aromatic, and
80+
"special."
81+
"""
82+
GPCRAnalysis.columnwise_entropy(msa::AbstractMultipleSequenceAlignment, aacode::ResidueAlphabet=reduced_code) =
83+
GPCRAnalysis.columnwise_entropy(r -> aacode[r], msa)
84+
85+
end

src/GPCRAnalysis.jl

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,9 @@ using Statistics
55
using LinearAlgebra
66

77
using BioStructures
8+
using BioStructures: amino_acid_data
89
using FASTX
910

10-
using MIToS: MIToS, Pfam, MSA
11-
using MIToS.MSA: AbstractMultipleSequenceAlignment, AnnotatedAlignedSequence, AnnotatedMultipleSequenceAlignment,
12-
ReducedAlphabet, GAP, XAA
13-
using MIToS.MSA: getsequence, getannotsequence, getsequencemapping, getresidues, three2residue, sequencenames,
14-
filtersequences!, percentsimilarity
15-
using MIToS.PDB: vanderwaalsradius, ishydrophobic, isaromatic, iscationic, isanionic, ishbonddonor, ishbondacceptor
16-
1711
using MultivariateStats
1812
using Distances
1913
using OffsetArrays
@@ -57,9 +51,11 @@ export sidechaincentroid, scvector, inward_tm_residues, inward_ecl_residues
5751
export features_from_structure
5852
export forcecomponents, optimize_weights, forcedict
5953

54+
include("consts.jl")
6055
include("naming_conventions.jl")
61-
include("utils.jl")
6256
include("msa.jl")
57+
include("utils.jl")
58+
include("query.jl")
6359
include("alphafold.jl")
6460
include("analyze.jl")
6561
include("tmalign.jl")

src/align.jl

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -279,22 +279,22 @@ end
279279
"""
280280
seqtms = align_ranges(seq1, seq2, seq2ranges::AbstractVector{<:AbstractUnitRange})
281281
282-
Transfer `refranges`, a list of reside index spans in `seq2`, to `seq1`. `seq1` and
282+
Transfer `seq2ranges`, a list of reside index spans in `seq2`, to `seq1`. `seq1` and
283283
`seq2` must be spatially aligned, and the assignment is made by minimizing
284284
inter-chain distance subject to the constraint of preserving sequence order.
285285
"""
286-
function align_ranges(seq1::ChainLike, seq2::AbstractVector{<:AbstractResidue}, refranges::AbstractVector{<:AbstractUnitRange}; kwargs...)
287-
anchoridxs = sizehint!(Int[], length(refranges)*2)
288-
for r in refranges
286+
function align_ranges(seq1::ChainLike, seq2::AbstractVector{<:AbstractResidue}, seq2ranges::AbstractVector{<:AbstractUnitRange}; kwargs...)
287+
anchoridxs = sizehint!(Int[], length(seq2ranges)*2)
288+
for r in seq2ranges
289289
push!(anchoridxs, first(r), last(r))
290290
end
291-
issorted(anchoridxs) || throw(ArgumentError("`refranges` must be strictly increasing spans, got $refranges"))
291+
issorted(anchoridxs) || throw(ArgumentError("`seq2ranges` must be strictly increasing spans, got $seq2ranges"))
292292
ϕ = align_nw(seq1, seq2[anchoridxs], NWGapCosts{Float64}(open1=Inf); kwargs...)
293293
@assert last.(ϕ) == eachindex(anchoridxs)
294294
return [ϕ[i][1]:ϕ[i+1][1] for i in 1:2:length(ϕ)]
295295
end
296-
align_ranges(seq1::ChainLike, seq2::Chain, refranges::AbstractVector{<:AbstractUnitRange}; kwargs...) =
297-
align_ranges(seq1, collectresidues(seq2), refranges; kwargs...)
296+
align_ranges(seq1::ChainLike, seq2::Chain, seq2ranges::AbstractVector{<:AbstractUnitRange}; kwargs...) =
297+
align_ranges(seq1, collectresidues(seq2), seq2ranges; kwargs...)
298298

299299
function score_nw(D::AbstractMatrix, gapcosts::NWGapCosts)
300300
Base.require_one_based_indexing(D)

src/alphafold.jl

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ end
3232
Return the latest version of all AlphaFold files in `dirname`.
3333
If `join` is `true`, then the full paths are returned.
3434
"""
35-
function alphafoldfiles(dirname=pwd(); join::Bool=false)
35+
function alphafoldfiles(dirname::AbstractString=pwd(); join::Bool=false)
3636
latest = Dict{String,Int}()
3737
latestfn = Dict{String,String}()
3838
for fn in readdir(dirname)
@@ -51,19 +51,19 @@ function alphafoldfiles(dirname=pwd(); join::Bool=false)
5151
end
5252

5353
"""
54-
msacode2structfile = alphafoldfiles(msa::AnnotatedMultipleSequenceAlignment, dirname=pwd())
54+
msacode2structfile = alphafoldfiles(msa, dirname=pwd())
5555
5656
Return a dictionary mapping `MSACode`s to the corresponding AlphaFold structure files.
5757
"""
58-
function alphafoldfiles(msa::AnnotatedMultipleSequenceAlignment, dirname=pwd(); join::Bool=false)
58+
function alphafoldfiles(msa, dirname::AbstractString=pwd(); join::Bool=false)
5959
afs = alphafoldfiles(dirname)
6060
accesscode2idx = Dict{AccessionCode,Int}()
6161
for (i, af) in pairs(afs)
6262
ac = AccessionCode(match(rex_alphafold_pdbs, af).captures[1])
6363
accesscode2idx[ac] = i
6464
end
6565
msacode2structfile = Dict{MSACode,String}()
66-
for name in sequencenames(msa)
66+
for name in sequencekeys(msa)
6767
ac = AccessionCode(msa, name)
6868
if haskey(accesscode2idx, ac)
6969
fn = afs[accesscode2idx[ac]]
@@ -116,18 +116,19 @@ function try_download_alphafold(uniprotXname::AbstractString, path::AbstractStri
116116
end
117117

118118
"""
119-
download_alphafolds(msa::AbstractMultipleSequenceAlignment; dirname=pwd())
119+
download_alphafolds(msa; dirname=pwd())
120120
download_alphafolds(ids; dirname=pwd())
121121
122-
Download all available [AlphaFold](https://alphafold.com/) structures for the sequences in `msa`.
123-
Missing entries are silently skipped.
122+
Download all available [AlphaFold](https://alphafold.com/) structures for the
123+
sequences in `msa`. Missing entries are silently skipped.
124124
125-
If a `AbstractMultipleSequenceAlignment` is provided, the downloaded PDB file is checked to ensure that
126-
the residues in the MSA sequence match those in the PDB file. If they do not match, the PDB file is removed.
125+
If an `msa` is provided, each downloaded PDB file is checked to ensure that the
126+
residues in the MSA sequence match those in the PDB file. If they do not match,
127+
the PDB file is removed.
127128
"""
128-
function download_alphafolds(msa::AbstractMultipleSequenceAlignment; dirname=pwd(), maxversion=nothing, kwargs...)
129+
function download_alphafolds(msa; dirname=pwd(), maxversion=nothing, kwargs...)
129130
maxversion === nothing || @warn "`download_alphafolds`: `maxversion` kwarg has no effect and is deprecated" maxlog=1
130-
@showprogress 1 "Downloading AlphaFold files..." for name in sequencenames(msa)
131+
@showprogress 1 "Downloading AlphaFold files..." for name in sequencekeys(msa)
131132
uname = AccessionCode(msa, name)
132133
url = query_alphafold_latest(uname)
133134
url === nothing && continue
@@ -136,14 +137,14 @@ function download_alphafolds(msa::AbstractMultipleSequenceAlignment; dirname=pwd
136137
if !isfile(path)
137138
Downloads.download(url, path)
138139
end
139-
if !validate_seq_residues(getsequence(msa, name), getchain(path))
140+
if !validate_seq_residues(msasequence(msa, name), getchain(path))
140141
@warn "Residues in $path do not match those in the sequence $name, removing PDB file"
141142
rm(path)
142143
end
143144
end
144145
end
145146

146-
function download_alphafolds(ids; dirname=pwd(), kwargs...)
147+
function download_alphafolds(ids::AbstractVector{<:AbstractString}; dirname=pwd(), kwargs...)
147148
@showprogress 1 "Downloading AlphaFold files..." for uname in ids
148149
url = query_alphafold_latest(uname)
149150
url === nothing && continue

src/analyze.jl

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
"""
2-
X = project_sequences(msa::AbstractMultipleSequenceAlignment; fracvar::Real = 0.9)
2+
X = project_sequences(msa; fracvar::Real = 0.9)
33
44
Perform a classical multidimensional scaling analysis to project the sequences in `msa` to a space
55
in which pairwise distances approximately reproduce `100 - percentsimilarity(seq1, seq2)`.
66
The dimensionality is chosen to reconstruction `fracvar` of the variance.
77
"""
8-
function project_sequences(msa::AbstractMultipleSequenceAlignment; fracvar::Real = 0.9)
9-
sim = percentsimilarity(msa)
8+
function project_sequences(msa; fracvar::Real = 0.9)
9+
sim = percent_similarity(msa)
1010
D = 100 .- Matrix(sim)
1111
f = fit(MDS, D; distances=true)
1212
# Capture sufficient variance
@@ -17,8 +17,6 @@ function project_sequences(msa::AbstractMultipleSequenceAlignment; fracvar::Real
1717
return X[1:nd, :]
1818
end
1919

20-
const reduced_code = ReducedAlphabet("(AILMV)(NQST)(RHK)(DE)(FWY)CGP")
21-
2220
function _entropy(v)
2321
count = countitems(v)
2422
e = 0.0
@@ -30,14 +28,14 @@ function _entropy(v)
3028
end
3129

3230
"""
33-
columnwise_entropy(msa::AbstractMultipleSequenceAlignment, aacode = ReducedAlphabet("(AILMV)(NQST)(RHK)(DE)(FWY)CGP"))
31+
columnwise_entropy(f, msa)
3432
35-
Compute the entropy of each column in an MSA. Low entropy indicates high conservation.
33+
Compute the entropy of each column in an MSA, after applying `f` to each residue. Low entropy indicates high conservation.
3634
3735
Unmatched entries (`'-'` residues) contribute to the entropy calculation as if they were an ordinary residue.
3836
"""
39-
function columnwise_entropy(msa::AbstractMultipleSequenceAlignment, aacode=reduced_code)
40-
resnum = map(r -> aacode[r], getresidues(msa))
37+
function columnwise_entropy(f, msa)
38+
resnum = map(f, residuematrix(msa))
4139
return map(_entropy, eachcol(resnum))
4240
end
4341

src/chimerax.jl

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ function chimerax_script(scriptfilename, struct_filenames, ridxs::AbstractVector
3232
end
3333

3434
"""
35-
chimerax_script(scriptfilename, uprot_list, msa::AnnotatedMultipleSequenceAlignment, colidxs;
35+
chimerax_script(scriptfilename, uprot_list, msa, colidxs;
3636
dir=pwd(), align=true, chain_transparency=80, styles=Dict{Int,String}(), extras=String[])
3737
3838
Create a [chimerax](https://www.cgl.ucsf.edu/chimerax/) visualization script
@@ -63,17 +63,17 @@ chimerax_script("myscript.cxc", ["P15409"], msa, [i1, i2, i3])
6363
6464
where `i1` through `i3` are column-indices in the `msa` that you'd like to view.
6565
"""
66-
function chimerax_script(scriptfilename, uprot_list, msa::AnnotatedMultipleSequenceAlignment, colidxs;
66+
function chimerax_script(scriptfilename, uprot_list, msa, colidxs;
6767
dir=pwd(), styles=Dict{Int,String}(), kwargs...)
6868
ridxs = [Int[] for _ in 1:length(uprot_list)]
6969
struct_filenames = Vector{String}(undef, length(uprot_list))
7070
rcstyles = Dict{Tuple{Int,Int},String}()
7171
afs = alphafoldfiles(msa, dir; join=true)
72-
uprot2msaidx = Dict{AccessionCode,Int}(AccessionCode(msa, name) => i for (i, name) in enumerate(sequencenames(msa)))
72+
uprot2msaidx = Dict{AccessionCode,Int}(AccessionCode(msa, name) => i for (i, name) in enumerate(sequencekeys(msa)))
7373
for (i, p) in enumerate(uprot_list)
7474
j = uprot2msaidx[AccessionCode(p)]
75-
struct_filenames[i] = afs[MSACode(sequencenames(msa)[j])]
76-
sm = getsequencemapping(msa, j)
75+
struct_filenames[i] = afs[MSACode(sequencekeys(msa)[j])]
76+
sm = sequenceindexes(msa, j)
7777
for (j, c) in enumerate(colidxs)
7878
ridx = sm[c]
7979
if iszero(ridx)

0 commit comments

Comments
 (0)