Skip to content

Commit 66ffb62

Browse files
committed
Move MIToS support to an extension
BioStockholm is much faster than MIToS at parsing large files, and it would be nice to be able to support it. As a first step, move MIToS support to an extension.
1 parent 52db1d9 commit 66ffb62

File tree

14 files changed

+576
-238
lines changed

14 files changed

+576
-238
lines changed

Project.toml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ Hungarian = "e91730f6-4275-51fb-a7a0-7064cfbd3b39"
1717
Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
1818
JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
1919
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
20-
MIToS = "51bafb47-8a16-5ded-8b04-24ef4eede0b5"
2120
MultivariateStats = "6f286f6a-111f-5878-ab1e-185364afe411"
2221
MutableConvexHulls = "948c7aac-0e5e-4631-af23-7a6bb7a17825"
2322
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
@@ -30,9 +29,11 @@ TravelingSalesmanHeuristics = "8c8f4381-2cdd-507c-846c-be2bcff6f45f"
3029
[weakdeps]
3130
HiGHS = "87dc4568-4c63-4d18-b0c0-bb2238e4078b"
3231
JuMP = "4076af6c-e467-56ae-b986-b466b2749572"
32+
MIToS = "51bafb47-8a16-5ded-8b04-24ef4eede0b5"
3333

3434
[extensions]
3535
GPCRAnalysisJuMPExt = ["JuMP", "HiGHS"]
36+
GPCRAnalysisMIToSExt = "MIToS"
3637

3738
[compat]
3839
BioStructures = "4.2"
@@ -65,7 +66,8 @@ julia = "1.10"
6566
HiGHS = "87dc4568-4c63-4d18-b0c0-bb2238e4078b"
6667
InvertedIndices = "41ab1584-1d38-5bbf-9106-f11c6c58b48f"
6768
JuMP = "4076af6c-e467-56ae-b986-b466b2749572"
69+
MIToS = "51bafb47-8a16-5ded-8b04-24ef4eede0b5"
6870
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
6971

7072
[targets]
71-
test = ["HiGHS", "InvertedIndices", "JuMP", "Test"]
73+
test = ["HiGHS", "InvertedIndices", "JuMP", "MIToS", "Test"]

ext/GPCRAnalysisMIToSExt.jl

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
module GPCRAnalysisMIToSExt
2+
3+
using GPCRAnalysis
4+
using Downloads
5+
using BioStructures
6+
using ProgressMeter
7+
8+
using GPCRAnalysis: ChainLike, ResidueLike, StructureLike, _entropy, validate_seq_residues, rex_alphafold_pdbs
9+
10+
using MIToS: MIToS, Pfam, MSA
11+
using MIToS.MSA: AbstractMultipleSequenceAlignment, AnnotatedAlignedSequence, AnnotatedMultipleSequenceAlignment,
12+
ReducedAlphabet, GAP, XAA
13+
using MIToS.MSA: getsequence, getannotsequence, getsequencemapping, getresidues, three2residue, sequencenames,
14+
filtersequences!, percentsimilarity
15+
16+
17+
Base.getindex(msa::AbstractMultipleSequenceAlignment, seqname::MSACode) = getsequence(msa, seqname.name)
18+
Base.getindex(msa::AbstractMultipleSequenceAlignment, seqname::AccessionCode) = getsequence(msa, MSACode(msa, seqname).name)
19+
20+
MIToS.PDB.ishydrophobic(a::AbstractAtom, rname::AbstractString) = (rname, atomname(a)) in MIToS.PDB._hydrophobic
21+
MIToS.PDB.isaromatic(a::AbstractAtom, rname::AbstractString) = (rname, atomname(a)) in MIToS.PDB._aromatic
22+
MIToS.PDB.iscationic(a::AbstractAtom, rname::AbstractString) = (rname, atomname(a)) in MIToS.PDB._cationic
23+
MIToS.PDB.isanionic(a::AbstractAtom, rname::AbstractString) = (rname, atomname(a)) in MIToS.PDB._anionic
24+
MIToS.PDB.ishbonddonor(a::AbstractAtom, rname::AbstractString) = (rname, atomname(a)) in keys(MIToS.PDB._hbond_donor)
25+
MIToS.PDB.ishbondacceptor(a::AbstractAtom, rname::AbstractString) = (rname, atomname(a)) in keys(MIToS.PDB._hbond_acceptor)
26+
27+
function GPCRAnalysis.AccessionCode(msa::AnnotatedMultipleSequenceAlignment, seqname::AbstractString)
28+
AccessionCode(uniprotX(getannotsequence(msa, seqname, "AC", seqname)))
29+
end
30+
GPCRAnalysis.AccessionCode(msa::AnnotatedMultipleSequenceAlignment, seqname::MSACode) = AccessionCode(msa, seqname.name)
31+
GPCRAnalysis.AccessionCode(::AnnotatedMultipleSequenceAlignment, seqname::AccessionCode) = seqname
32+
33+
function GPCRAnalysis.MSACode(msa::AnnotatedMultipleSequenceAlignment, accession::AbstractString)
34+
seqnames = sequencenames(msa)
35+
return MSACode(seqnames[findfirst(x -> AccessionCode(msa, x).name == accession, seqnames)])
36+
end
37+
GPCRAnalysis.MSACode(msa::AnnotatedMultipleSequenceAlignment, accession::AccessionCode) = MSACode(msa, accession.name)
38+
GPCRAnalysis.MSACode(::AnnotatedMultipleSequenceAlignment, accession::MSACode) = accession
39+
40+
GPCRAnalysis.SequenceMapping(seq::AnnotatedAlignedSequence) = SequenceMapping(getsequencemapping(seq))
41+
42+
GPCRAnalysis.percent_similarity(msa::AbstractMultipleSequenceAlignment) = percentsimilarity(msa)
43+
44+
# Move this to MIToS?
45+
if !hasmethod(getsequencemapping, Tuple{AnnotatedAlignedSequence})
46+
function MIToS.MSA.getsequencemapping(seq::AnnotatedAlignedSequence)
47+
getsequencemapping(seq, sequencenames(seq)[1])
48+
end
49+
function MIToS.MSA.getsequencemapping(msa::Union{AnnotatedAlignedSequence,AnnotatedMultipleSequenceAlignment}, seq_id::String)
50+
MIToS.MSA._str2int_mapping(getannotsequence(msa, seq_id, "SeqMap"))
51+
end
52+
function MIToS.MSA.getsequencemapping(msa::AnnotatedMultipleSequenceAlignment, seqid::Regex)
53+
id = findfirst(str -> occursin(seqid, str), sequencenames(msa))
54+
getsequencemapping(msa, id)
55+
end
56+
end
57+
58+
function GPCRAnalysis.validate_seq_residues(seq::AnnotatedAlignedSequence, chain)
59+
for (i, r) in zip(getsequencemapping(seq), seq)
60+
(r == GAP || r == XAA) && continue
61+
res = three2residue(String(resname(chain[i])))
62+
res == r || return false
63+
end
64+
return true
65+
end
66+
67+
const reduced_code = ReducedAlphabet("(AILMV)(NQST)(RHK)(DE)(FWY)CGP")
68+
69+
function GPCRAnalysis.columnwise_entropy(msa::AbstractMultipleSequenceAlignment, aacode=reduced_code)
70+
resnum = map(r -> aacode[r], getresidues(msa))
71+
return map(_entropy, eachcol(resnum))
72+
end
73+
74+
function GPCRAnalysis.filter_long!(msa::AbstractMultipleSequenceAlignment, minres::Real)
75+
# Get rid of short sequences
76+
nresidues = map(eachrow(msa)) do v
77+
sum(!=(MSA.Residue('-')), v)
78+
end
79+
mask = nresidues .> minres
80+
filtersequences!(msa, mask)
81+
end
82+
83+
function GPCRAnalysis.aa_properties_matrix(msa::AbstractMultipleSequenceAlignment)
84+
props = copy(aa_properties_zscored)
85+
props[Char(GAP)] = zero(valtype(props))
86+
props['X'] = zero(valtype(props))
87+
return [props[Char(residue)] for residue in permutedims(msa)]
88+
end
89+
90+
function GPCRAnalysis.chimerax_script(scriptfilename, uprot_list, msa::AnnotatedMultipleSequenceAlignment, colidxs;
91+
dir=pwd(), styles=Dict{Int,String}(), kwargs...)
92+
ridxs = [Int[] for _ in 1:length(uprot_list)]
93+
struct_filenames = Vector{String}(undef, length(uprot_list))
94+
rcstyles = Dict{Tuple{Int,Int},String}()
95+
afs = alphafoldfiles(msa, dir; join=true)
96+
uprot2msaidx = Dict{AccessionCode,Int}(AccessionCode(msa, name) => i for (i, name) in enumerate(sequencenames(msa)))
97+
for (i, p) in enumerate(uprot_list)
98+
j = uprot2msaidx[AccessionCode(p)]
99+
struct_filenames[i] = afs[MSACode(sequencenames(msa)[j])]
100+
sm = getsequencemapping(msa, j)
101+
for (j, c) in enumerate(colidxs)
102+
ridx = sm[c]
103+
if iszero(ridx)
104+
@warn "column $c not set in $p"
105+
continue
106+
end
107+
push!(ridxs[i], ridx)
108+
style = get(styles, c, nothing)
109+
if style !== nothing
110+
rcstyles[(i, j)] = style
111+
end
112+
end
113+
end
114+
return chimerax_script(scriptfilename, struct_filenames, ridxs; styles=rcstyles, kwargs...)
115+
end
116+
117+
function GPCRAnalysis.filter_species!(msa::AbstractMultipleSequenceAlignment, speciesname::AbstractString)
118+
mask = map(x -> species(x) == speciesname, sequencenames(msa))
119+
filtersequences!(msa, mask)
120+
end
121+
122+
GPCRAnalysis.gapres(::Type{MSA.Residue}) = MSA.Residue('-')
123+
124+
function GPCRAnalysis.StructAlign(struct1::ChainLike, struct2::ChainLike,
125+
align1::AbstractVector{MSA.Residue}, align2::AbstractVector{MSA.Residue},
126+
quality)
127+
StructAlign(MapAlign(struct1, align1, quality), MapAlign(struct2, align2, quality))
128+
end
129+
130+
"""
131+
msacode2structfile = alphafoldfiles(msa::AnnotatedMultipleSequenceAlignment, dirname=pwd())
132+
133+
Return a dictionary mapping `MSACode`s to the corresponding AlphaFold structure files.
134+
"""
135+
function GPCRAnalysis.alphafoldfiles(msa::AnnotatedMultipleSequenceAlignment, dirname=pwd(); join::Bool=false)
136+
afs = alphafoldfiles(dirname)
137+
accesscode2idx = Dict{AccessionCode,Int}()
138+
for (i, af) in pairs(afs)
139+
ac = AccessionCode(match(rex_alphafold_pdbs, af).captures[1])
140+
accesscode2idx[ac] = i
141+
end
142+
msacode2structfile = Dict{MSACode,String}()
143+
for name in sequencenames(msa)
144+
ac = AccessionCode(msa, name)
145+
if haskey(accesscode2idx, ac)
146+
fn = afs[accesscode2idx[ac]]
147+
msacode2structfile[MSACode(name)] = join ? joinpath(dirname, fn) : fn
148+
end
149+
end
150+
return msacode2structfile
151+
end
152+
153+
function GPCRAnalysis.download_alphafolds(msa::AbstractMultipleSequenceAlignment; dirname=pwd(), maxversion=nothing, kwargs...)
154+
maxversion === nothing || @warn "`download_alphafolds`: `maxversion` kwarg has no effect and is deprecated" maxlog=1
155+
@showprogress 1 "Downloading AlphaFold files..." for name in sequencenames(msa)
156+
uname = AccessionCode(msa, name)
157+
url = query_alphafold_latest(uname)
158+
url === nothing && continue
159+
fn = split(url, '/')[end]
160+
path = joinpath(dirname, fn)
161+
if !isfile(path)
162+
Downloads.download(url, path)
163+
end
164+
if !validate_seq_residues(getsequence(msa, name), getchain(path))
165+
@warn "Residues in $path do not match those in the sequence $name, removing PDB file"
166+
rm(path)
167+
end
168+
end
169+
end
170+
171+
end

src/GPCRAnalysis.jl

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,10 @@ using Statistics
55
using LinearAlgebra
66

77
using BioStructures
8+
using BioStructures: amino_acid_data
89
using FASTX
910

10-
using MIToS: MIToS, Pfam, MSA
11-
using MIToS.MSA: AbstractMultipleSequenceAlignment, AnnotatedAlignedSequence, AnnotatedMultipleSequenceAlignment,
12-
ReducedAlphabet, GAP, XAA
13-
using MIToS.MSA: getsequence, getannotsequence, getsequencemapping, getresidues, three2residue, sequencenames,
14-
filtersequences!, percentsimilarity
15-
using MIToS.PDB: vanderwaalsradius, ishydrophobic, isaromatic, iscationic, isanionic, ishbonddonor, ishbondacceptor
11+
# using MIToS.PDB: vanderwaalsradius, ishydrophobic, isaromatic, iscationic, isanionic, ishbonddonor, ishbondacceptor
1612

1713
using MultivariateStats
1814
using Distances
@@ -57,6 +53,7 @@ export sidechaincentroid, scvector, inward_tm_residues, inward_ecl_residues
5753
export features_from_structure
5854
export forcecomponents, optimize_weights, forcedict
5955

56+
include("consts.jl")
6057
include("naming_conventions.jl")
6158
include("utils.jl")
6259
include("msa.jl")

src/align.jl

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -279,22 +279,22 @@ end
279279
"""
280280
seqtms = align_ranges(seq1, seq2, seq2ranges::AbstractVector{<:AbstractUnitRange})
281281
282-
Transfer `refranges`, a list of reside index spans in `seq2`, to `seq1`. `seq1` and
282+
Transfer `seq2ranges`, a list of reside index spans in `seq2`, to `seq1`. `seq1` and
283283
`seq2` must be spatially aligned, and the assignment is made by minimizing
284284
inter-chain distance subject to the constraint of preserving sequence order.
285285
"""
286-
function align_ranges(seq1::ChainLike, seq2::AbstractVector{<:AbstractResidue}, refranges::AbstractVector{<:AbstractUnitRange}; kwargs...)
287-
anchoridxs = sizehint!(Int[], length(refranges)*2)
288-
for r in refranges
286+
function align_ranges(seq1::ChainLike, seq2::AbstractVector{<:AbstractResidue}, seq2ranges::AbstractVector{<:AbstractUnitRange}; kwargs...)
287+
anchoridxs = sizehint!(Int[], length(seq2ranges)*2)
288+
for r in seq2ranges
289289
push!(anchoridxs, first(r), last(r))
290290
end
291-
issorted(anchoridxs) || throw(ArgumentError("`refranges` must be strictly increasing spans, got $refranges"))
291+
issorted(anchoridxs) || throw(ArgumentError("`seq2ranges` must be strictly increasing spans, got $seq2ranges"))
292292
ϕ = align_nw(seq1, seq2[anchoridxs], NWGapCosts{Float64}(open1=Inf); kwargs...)
293293
@assert last.(ϕ) == eachindex(anchoridxs)
294294
return [ϕ[i][1]:ϕ[i+1][1] for i in 1:2:length(ϕ)]
295295
end
296-
align_ranges(seq1::ChainLike, seq2::Chain, refranges::AbstractVector{<:AbstractUnitRange}; kwargs...) =
297-
align_ranges(seq1, collectresidues(seq2), refranges; kwargs...)
296+
align_ranges(seq1::ChainLike, seq2::Chain, seq2ranges::AbstractVector{<:AbstractUnitRange}; kwargs...) =
297+
align_ranges(seq1, collectresidues(seq2), seq2ranges; kwargs...)
298298

299299
function score_nw(D::AbstractMatrix, gapcosts::NWGapCosts)
300300
Base.require_one_based_indexing(D)

src/alphafold.jl

Lines changed: 1 addition & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -50,29 +50,6 @@ function alphafoldfiles(dirname=pwd(); join::Bool=false)
5050
return join ? [joinpath(dirname, fn) for fn in fns] : fns
5151
end
5252

53-
"""
54-
msacode2structfile = alphafoldfiles(msa::AnnotatedMultipleSequenceAlignment, dirname=pwd())
55-
56-
Return a dictionary mapping `MSACode`s to the corresponding AlphaFold structure files.
57-
"""
58-
function alphafoldfiles(msa::AnnotatedMultipleSequenceAlignment, dirname=pwd(); join::Bool=false)
59-
afs = alphafoldfiles(dirname)
60-
accesscode2idx = Dict{AccessionCode,Int}()
61-
for (i, af) in pairs(afs)
62-
ac = AccessionCode(match(rex_alphafold_pdbs, af).captures[1])
63-
accesscode2idx[ac] = i
64-
end
65-
msacode2structfile = Dict{MSACode,String}()
66-
for name in sequencenames(msa)
67-
ac = AccessionCode(msa, name)
68-
if haskey(accesscode2idx, ac)
69-
fn = afs[accesscode2idx[ac]]
70-
msacode2structfile[MSACode(name)] = join ? joinpath(dirname, fn) : fn
71-
end
72-
end
73-
return msacode2structfile
74-
end
75-
7653
"""
7754
url = query_alphafold_latest(uniprotXname; format="cif")
7855
@@ -116,7 +93,7 @@ function try_download_alphafold(uniprotXname::AbstractString, path::AbstractStri
11693
end
11794

11895
"""
119-
download_alphafolds(msa::AbstractMultipleSequenceAlignment; dirname=pwd())
96+
download_alphafolds(msa; dirname=pwd())
12097
download_alphafolds(ids; dirname=pwd())
12198
12299
Download all available [AlphaFold](https://alphafold.com/) structures for the sequences in `msa`.
@@ -125,24 +102,6 @@ Missing entries are silently skipped.
125102
If a `AbstractMultipleSequenceAlignment` is provided, the downloaded PDB file is checked to ensure that
126103
the residues in the MSA sequence match those in the PDB file. If they do not match, the PDB file is removed.
127104
"""
128-
function download_alphafolds(msa::AbstractMultipleSequenceAlignment; dirname=pwd(), maxversion=nothing, kwargs...)
129-
maxversion === nothing || @warn "`download_alphafolds`: `maxversion` kwarg has no effect and is deprecated" maxlog=1
130-
@showprogress 1 "Downloading AlphaFold files..." for name in sequencenames(msa)
131-
uname = AccessionCode(msa, name)
132-
url = query_alphafold_latest(uname)
133-
url === nothing && continue
134-
fn = split(url, '/')[end]
135-
path = joinpath(dirname, fn)
136-
if !isfile(path)
137-
Downloads.download(url, path)
138-
end
139-
if !validate_seq_residues(getsequence(msa, name), getchain(path))
140-
@warn "Residues in $path do not match those in the sequence $name, removing PDB file"
141-
rm(path)
142-
end
143-
end
144-
end
145-
146105
function download_alphafolds(ids; dirname=pwd(), kwargs...)
147106
@showprogress 1 "Downloading AlphaFold files..." for uname in ids
148107
url = query_alphafold_latest(uname)

src/analyze.jl

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
"""
2-
X = project_sequences(msa::AbstractMultipleSequenceAlignment; fracvar::Real = 0.9)
2+
X = project_sequences(msa; fracvar::Real = 0.9)
33
44
Perform a classical multidimensional scaling analysis to project the sequences in `msa` to a space
55
in which pairwise distances approximately reproduce `100 - percentsimilarity(seq1, seq2)`.
66
The dimensionality is chosen to reconstruction `fracvar` of the variance.
77
"""
8-
function project_sequences(msa::AbstractMultipleSequenceAlignment; fracvar::Real = 0.9)
9-
sim = percentsimilarity(msa)
8+
function project_sequences(msa; fracvar::Real = 0.9)
9+
sim = percent_similarity(msa)
1010
D = 100 .- Matrix(sim)
1111
f = fit(MDS, D; distances=true)
1212
# Capture sufficient variance
@@ -17,8 +17,6 @@ function project_sequences(msa::AbstractMultipleSequenceAlignment; fracvar::Real
1717
return X[1:nd, :]
1818
end
1919

20-
const reduced_code = ReducedAlphabet("(AILMV)(NQST)(RHK)(DE)(FWY)CGP")
21-
2220
function _entropy(v)
2321
count = countitems(v)
2422
e = 0.0
@@ -30,16 +28,13 @@ function _entropy(v)
3028
end
3129

3230
"""
33-
columnwise_entropy(msa::AbstractMultipleSequenceAlignment, aacode = ReducedAlphabet("(AILMV)(NQST)(RHK)(DE)(FWY)CGP"))
31+
columnwise_entropy(msa, aacode = reduced_code)
3432
3533
Compute the entropy of each column in an MSA. Low entropy indicates high conservation.
3634
3735
Unmatched entries (`'-'` residues) contribute to the entropy calculation as if they were an ordinary residue.
3836
"""
39-
function columnwise_entropy(msa::AbstractMultipleSequenceAlignment, aacode=reduced_code)
40-
resnum = map(r -> aacode[r], getresidues(msa))
41-
return map(_entropy, eachcol(resnum))
42-
end
37+
function columnwise_entropy end
4338

4439
"""
4540
residue_centroid(r::AbstractResidue)

0 commit comments

Comments
 (0)