Skip to content

Commit 81144bb

Browse files
authored
Merge pull request #40 from HolyLab/teh/mitosext
Move MIToS support to an extension, add BioStockholm
2 parents 3b3996f + 500e440 commit 81144bb

19 files changed

+818
-231
lines changed

NEWS.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,11 @@
1+
# v0.6.0
2+
3+
Breaking changes:
4+
- MIToS support has been moved to an extension. Users of the MSA functionality now need to add `import MIToS` (in addition to `using GPCRAnalysis`) to trigger loading of the extension and support for the corresponding functionality.
5+
6+
New features:
7+
- Support for MSAs loaded with BioStockolm (which can sometimes be used instead of MIToS)
8+
19
# v0.5.0
210

311
Breaking changes:

Project.toml

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "GPCRAnalysis"
22
uuid = "c1d73f9e-d42a-418a-8d5b-c7b00ec0358f"
3+
version = "0.6.0"
34
authors = ["Tim Holy <[email protected]> and contributors"]
4-
version = "0.5.1"
55

66
[deps]
77
BioStructures = "de9282ab-8554-53be-b2d6-f6c222edabfc"
@@ -17,7 +17,6 @@ Hungarian = "e91730f6-4275-51fb-a7a0-7064cfbd3b39"
1717
Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
1818
JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
1919
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
20-
MIToS = "51bafb47-8a16-5ded-8b04-24ef4eede0b5"
2120
MultivariateStats = "6f286f6a-111f-5878-ab1e-185364afe411"
2221
MutableConvexHulls = "948c7aac-0e5e-4631-af23-7a6bb7a17825"
2322
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
@@ -28,13 +27,18 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
2827
TravelingSalesmanHeuristics = "8c8f4381-2cdd-507c-846c-be2bcff6f45f"
2928

3029
[weakdeps]
30+
BioStockholm = "eeb925a3-6f9d-43e6-829e-e0ea03b76ecf"
3131
HiGHS = "87dc4568-4c63-4d18-b0c0-bb2238e4078b"
3232
JuMP = "4076af6c-e467-56ae-b986-b466b2749572"
33+
MIToS = "51bafb47-8a16-5ded-8b04-24ef4eede0b5"
3334

3435
[extensions]
36+
GPCRAnalysisBioStockholmExt = "BioStockholm"
3537
GPCRAnalysisJuMPExt = ["JuMP", "HiGHS"]
38+
GPCRAnalysisMIToSExt = "MIToS"
3639

3740
[compat]
41+
BioStockholm = "0.2.1"
3842
BioStructures = "4.2"
3943
ColorTypes = "0.11, 0.12"
4044
Distances = "0.10"
@@ -62,10 +66,12 @@ TravelingSalesmanHeuristics = "0.3"
6266
julia = "1.10"
6367

6468
[extras]
69+
BioStockholm = "eeb925a3-6f9d-43e6-829e-e0ea03b76ecf"
6570
HiGHS = "87dc4568-4c63-4d18-b0c0-bb2238e4078b"
6671
InvertedIndices = "41ab1584-1d38-5bbf-9106-f11c6c58b48f"
6772
JuMP = "4076af6c-e467-56ae-b986-b466b2749572"
73+
MIToS = "51bafb47-8a16-5ded-8b04-24ef4eede0b5"
6874
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
6975

7076
[targets]
71-
test = ["HiGHS", "InvertedIndices", "JuMP", "Test"]
77+
test = ["BioStockholm", "HiGHS", "InvertedIndices", "JuMP", "MIToS", "Test"]

ext/GPCRAnalysisBioStockholmExt.jl

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
module GPCRAnalysisBioStockholmExt
2+
3+
using GPCRAnalysis
4+
using BioStockholm
5+
using BioStockholm: OrderedDict # from OrderedCollections.jl
6+
7+
function conscols(msa::MSA)
8+
ss = msa.GC["SS_cons"]
9+
return findfirst(!=( '.'), ss):findlast(!=( '.'), ss)
10+
end
11+
12+
# Low-level API implementation
13+
# GPCRAnalysis.sequenceindexes(msaseq::AnnotatedAlignedSequence) = getsequencemapping(msaseq)
14+
# GPCRAnalysis.sequenceindexes(msaseq::MSA, i::Int) = getsequencemapping(msaseq, i)
15+
GPCRAnalysis.sequencekeys(msa::MSA) = collect(keys(msa.seq))
16+
GPCRAnalysis.msasequence(msa::MSA, key) = msa.seq[key][conscols(msa)]
17+
function GPCRAnalysis.residuematrix(msa::MSA)
18+
keepcols = conscols(msa)
19+
reduce(vcat, [permutedims(seq[keepcols]) for (_, seq) in msa.seq])
20+
end
21+
GPCRAnalysis.subseqs(msa::MSA{T}, rowmask::AbstractVector{Bool}) where T = MSA{T}(OrderedDict(pr for (pr, keep) in zip(msa.seq, rowmask) if keep), msa.GF, OrderedDict(pr for (pr, keep) in zip(msa.GS, rowmask) if keep), msa.GC, msa.GR)
22+
function GPCRAnalysis.subseqs!(msa::MSA, rowmask::AbstractVector{Bool})
23+
for ((key, _), keep) in zip(msa.seq, rowmask)
24+
if !keep
25+
delete!(msa.seq, key)
26+
delete!(msa.GS, key)
27+
end
28+
end
29+
return msa
30+
end
31+
GPCRAnalysis.columnindexes(msa::BioStockholm.MSA) = conscols(msa)
32+
33+
Base.getindex(msa::MSA, seqname::MSACode) = msa.seq[seqname.name][conscols(msa)]
34+
Base.getindex(msa::MSA, seqname::AccessionCode) = msa[MSACode(msa, seqname)]
35+
36+
37+
function GPCRAnalysis.AccessionCode(msa::MSA, seqname::AbstractString)
38+
AccessionCode(split(msa.GS[seqname]["AC"], '.')[1])
39+
end
40+
GPCRAnalysis.AccessionCode(msa::MSA, seqname::MSACode) = AccessionCode(msa, seqname.name)
41+
GPCRAnalysis.AccessionCode(::MSA, seqname::AccessionCode) = seqname
42+
43+
function GPCRAnalysis.MSACode(msa::MSA, accession::AbstractString)
44+
acs = [split(ac["AC"], '.')[1] for (_, ac) in msa.GS]
45+
i = findfirst(==(accession), acs)
46+
return MSACode(GPCRAnalysis.sequencekeys(msa)[i])
47+
end
48+
GPCRAnalysis.MSACode(msa::MSA, accession::AccessionCode) = MSACode(msa, accession.name)
49+
GPCRAnalysis.MSACode(::MSA, accession::MSACode) = accession
50+
51+
52+
function reduced_alphabet(r::Char)
53+
if r == '-'
54+
return 0
55+
elseif r in ('A','I','L','M','V')
56+
return 1 # hydrophobic
57+
elseif r in ('N','Q','S','T')
58+
return 2 # polar
59+
elseif r in ('R','H','K')
60+
return 3 # charged
61+
elseif r in ('D','E')
62+
return 4 # charged
63+
elseif r in ('F','W','Y')
64+
return 5 # aromatic
65+
end
66+
offset = findfirst(==(r), ('C','G','P'))
67+
offset === nothing && throw(ArgumentError("Unknown residue '$r'"))
68+
return 5 + offset # special or unknown
69+
end
70+
71+
GPCRAnalysis.columnwise_entropy(msa) = columnwise_entropy(reduced_alphabet, msa)
72+
73+
function GPCRAnalysis.percent_similarity(f, msa::MSA)
74+
# This mimics MIToS's implementation
75+
function pctsim(v1, v2)
76+
same = l = 0
77+
for (a, b) in zip(v1, v2)
78+
a == b == 0 && continue # skip gaps
79+
same += a == b
80+
l += 1
81+
end
82+
return 100 * same / l
83+
end
84+
85+
M = f.(GPCRAnalysis.residuematrix(msa))
86+
n = size(M, 1)
87+
S = zeros(Float64, n, n)
88+
for i in 1:n
89+
for j in i:n
90+
S[i, j] = pctsim(M[i, :], M[j, :])
91+
S[j, i] = S[i, j]
92+
end
93+
end
94+
return S
95+
end
96+
GPCRAnalysis.percent_similarity(msa::MSA) = GPCRAnalysis.percent_similarity(reduced_alphabet, msa)
97+
98+
99+
end

ext/GPCRAnalysisMIToSExt.jl

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
module GPCRAnalysisMIToSExt
2+
3+
using GPCRAnalysis
4+
using Downloads
5+
using BioStructures
6+
using ProgressMeter
7+
8+
using GPCRAnalysis: ChainLike, ResidueLike, StructureLike, _entropy, validate_seq_residues, rex_alphafold_pdbs
9+
10+
using MIToS: MIToS, Pfam, MSA
11+
using MIToS.MSA: AbstractMultipleSequenceAlignment, AnnotatedAlignedSequence, AnnotatedMultipleSequenceAlignment,
12+
ReducedAlphabet, ResidueAlphabet, GAP, XAA
13+
using MIToS.MSA: getsequence, getannotsequence, getsequencemapping, getresidues, three2residue, sequencenames,
14+
filtersequences, filtersequences!, percentsimilarity, getcolumnmapping
15+
16+
17+
# Low-level API implementation
18+
GPCRAnalysis.sequenceindexes(msaseq::AnnotatedAlignedSequence) = getsequencemapping(msaseq)
19+
GPCRAnalysis.sequenceindexes(msaseq::AbstractMultipleSequenceAlignment, i::Int) = getsequencemapping(msaseq, i)
20+
GPCRAnalysis.isgap(res::MSA.Residue) = res == GAP
21+
GPCRAnalysis.isunknown(res::MSA.Residue) = res == XAA
22+
GPCRAnalysis.sequencekeys(msa::AbstractMultipleSequenceAlignment) = sequencenames(msa)
23+
GPCRAnalysis.msasequence(msa::AbstractMultipleSequenceAlignment, key) = getsequence(msa, key)
24+
GPCRAnalysis.residuematrix(msa::AbstractMultipleSequenceAlignment) = getresidues(msa)
25+
GPCRAnalysis.subseqs(msa::AbstractMultipleSequenceAlignment, rowmask) = filtersequences(msa, rowmask)
26+
GPCRAnalysis.subseqs!(msa::AbstractMultipleSequenceAlignment, rowmask) = filtersequences!(msa, rowmask)
27+
GPCRAnalysis.percent_similarity(msa::AbstractMultipleSequenceAlignment) = percentsimilarity(msa)
28+
GPCRAnalysis.columnindexes(msa::MSA.AbstractMultipleSequenceAlignment) = getcolumnmapping(msa)
29+
30+
Base.getindex(msa::AbstractMultipleSequenceAlignment, seqname::MSACode) = getsequence(msa, seqname.name)
31+
Base.getindex(msa::AbstractMultipleSequenceAlignment, seqname::AccessionCode) = getsequence(msa, MSACode(msa, seqname).name)
32+
33+
function GPCRAnalysis.AccessionCode(msa::AnnotatedMultipleSequenceAlignment, seqname::AbstractString)
34+
AccessionCode(uniprotX(getannotsequence(msa, seqname, "AC", seqname)))
35+
end
36+
GPCRAnalysis.AccessionCode(msa::AnnotatedMultipleSequenceAlignment, seqname::MSACode) = AccessionCode(msa, seqname.name)
37+
GPCRAnalysis.AccessionCode(::AnnotatedMultipleSequenceAlignment, seqname::AccessionCode) = seqname
38+
39+
function GPCRAnalysis.MSACode(msa::AnnotatedMultipleSequenceAlignment, accession::AbstractString)
40+
seqnames = sequencenames(msa)
41+
return MSACode(seqnames[findfirst(x -> AccessionCode(msa, x).name == accession, seqnames)])
42+
end
43+
GPCRAnalysis.MSACode(msa::AnnotatedMultipleSequenceAlignment, accession::AccessionCode) = MSACode(msa, accession.name)
44+
GPCRAnalysis.MSACode(::AnnotatedMultipleSequenceAlignment, accession::MSACode) = accession
45+
46+
GPCRAnalysis.SequenceMapping(seq::AnnotatedAlignedSequence) = SequenceMapping(getsequencemapping(seq))
47+
48+
# Move this to MIToS?
49+
if !hasmethod(getsequencemapping, Tuple{AnnotatedAlignedSequence})
50+
function MIToS.MSA.getsequencemapping(seq::AnnotatedAlignedSequence)
51+
getsequencemapping(seq, sequencenames(seq)[1])
52+
end
53+
function MIToS.MSA.getsequencemapping(msa::Union{AnnotatedAlignedSequence,AnnotatedMultipleSequenceAlignment}, seq_id::String)
54+
MIToS.MSA._str2int_mapping(getannotsequence(msa, seq_id, "SeqMap"))
55+
end
56+
function MIToS.MSA.getsequencemapping(msa::AnnotatedMultipleSequenceAlignment, seqid::Regex)
57+
id = findfirst(str -> occursin(seqid, str), sequencenames(msa))
58+
getsequencemapping(msa, id)
59+
end
60+
end
61+
62+
const reduced_code = ReducedAlphabet("(AILMV)(NQST)(RHK)(DE)(FWY)CGP")
63+
64+
"""
65+
columnwise_entropy(msa, aacode = reduced_code)
66+
67+
Call `columnwise_entropy` after mapping each residue through `aacode`.
68+
69+
The default code is `ReducedAlphabet("(AILMV)(NQST)(RHK)(DE)(FWY)CGP")`, which
70+
groups residues into categories hydrophobic, polar, charged, aromatic, and
71+
"special."
72+
"""
73+
GPCRAnalysis.columnwise_entropy(msa::AbstractMultipleSequenceAlignment, aacode::ResidueAlphabet=reduced_code) =
74+
GPCRAnalysis.columnwise_entropy(r -> aacode[r], msa)
75+
76+
end

src/GPCRAnalysis.jl

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,9 @@ using Statistics
55
using LinearAlgebra
66

77
using BioStructures
8+
using BioStructures: amino_acid_data
89
using FASTX
910

10-
using MIToS: MIToS, Pfam, MSA
11-
using MIToS.MSA: AbstractMultipleSequenceAlignment, AnnotatedAlignedSequence, AnnotatedMultipleSequenceAlignment,
12-
ReducedAlphabet, GAP, XAA
13-
using MIToS.MSA: getsequence, getannotsequence, getsequencemapping, getresidues, three2residue, sequencenames,
14-
filtersequences!, percentsimilarity
15-
using MIToS.PDB: vanderwaalsradius, ishydrophobic, isaromatic, iscationic, isanionic, ishbonddonor, ishbondacceptor
16-
1711
using MultivariateStats
1812
using Distances
1913
using OffsetArrays
@@ -57,9 +51,11 @@ export sidechaincentroid, scvector, inward_tm_residues, inward_ecl_residues
5751
export features_from_structure
5852
export forcecomponents, optimize_weights, forcedict
5953

54+
include("consts.jl")
6055
include("naming_conventions.jl")
61-
include("utils.jl")
6256
include("msa.jl")
57+
include("utils.jl")
58+
include("query.jl")
6359
include("alphafold.jl")
6460
include("analyze.jl")
6561
include("tmalign.jl")

src/align.jl

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -279,22 +279,22 @@ end
279279
"""
280280
seqtms = align_ranges(seq1, seq2, seq2ranges::AbstractVector{<:AbstractUnitRange})
281281
282-
Transfer `refranges`, a list of reside index spans in `seq2`, to `seq1`. `seq1` and
282+
Transfer `seq2ranges`, a list of reside index spans in `seq2`, to `seq1`. `seq1` and
283283
`seq2` must be spatially aligned, and the assignment is made by minimizing
284284
inter-chain distance subject to the constraint of preserving sequence order.
285285
"""
286-
function align_ranges(seq1::ChainLike, seq2::AbstractVector{<:AbstractResidue}, refranges::AbstractVector{<:AbstractUnitRange}; kwargs...)
287-
anchoridxs = sizehint!(Int[], length(refranges)*2)
288-
for r in refranges
286+
function align_ranges(seq1::ChainLike, seq2::AbstractVector{<:AbstractResidue}, seq2ranges::AbstractVector{<:AbstractUnitRange}; kwargs...)
287+
anchoridxs = sizehint!(Int[], length(seq2ranges)*2)
288+
for r in seq2ranges
289289
push!(anchoridxs, first(r), last(r))
290290
end
291-
issorted(anchoridxs) || throw(ArgumentError("`refranges` must be strictly increasing spans, got $refranges"))
291+
issorted(anchoridxs) || throw(ArgumentError("`seq2ranges` must be strictly increasing spans, got $seq2ranges"))
292292
ϕ = align_nw(seq1, seq2[anchoridxs], NWGapCosts{Float64}(open1=Inf); kwargs...)
293293
@assert last.(ϕ) == eachindex(anchoridxs)
294294
return [ϕ[i][1]:ϕ[i+1][1] for i in 1:2:length(ϕ)]
295295
end
296-
align_ranges(seq1::ChainLike, seq2::Chain, refranges::AbstractVector{<:AbstractUnitRange}; kwargs...) =
297-
align_ranges(seq1, collectresidues(seq2), refranges; kwargs...)
296+
align_ranges(seq1::ChainLike, seq2::Chain, seq2ranges::AbstractVector{<:AbstractUnitRange}; kwargs...) =
297+
align_ranges(seq1, collectresidues(seq2), seq2ranges; kwargs...)
298298

299299
function score_nw(D::AbstractMatrix, gapcosts::NWGapCosts)
300300
Base.require_one_based_indexing(D)

src/alphafold.jl

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ end
3232
Return the latest version of all AlphaFold files in `dirname`.
3333
If `join` is `true`, then the full paths are returned.
3434
"""
35-
function alphafoldfiles(dirname=pwd(); join::Bool=false)
35+
function alphafoldfiles(dirname::AbstractString=pwd(); join::Bool=false)
3636
latest = Dict{String,Int}()
3737
latestfn = Dict{String,String}()
3838
for fn in readdir(dirname)
@@ -51,19 +51,19 @@ function alphafoldfiles(dirname=pwd(); join::Bool=false)
5151
end
5252

5353
"""
54-
msacode2structfile = alphafoldfiles(msa::AnnotatedMultipleSequenceAlignment, dirname=pwd())
54+
msacode2structfile = alphafoldfiles(msa, dirname=pwd())
5555
5656
Return a dictionary mapping `MSACode`s to the corresponding AlphaFold structure files.
5757
"""
58-
function alphafoldfiles(msa::AnnotatedMultipleSequenceAlignment, dirname=pwd(); join::Bool=false)
58+
function alphafoldfiles(msa, dirname::AbstractString=pwd(); join::Bool=false)
5959
afs = alphafoldfiles(dirname)
6060
accesscode2idx = Dict{AccessionCode,Int}()
6161
for (i, af) in pairs(afs)
6262
ac = AccessionCode(match(rex_alphafold_pdbs, af).captures[1])
6363
accesscode2idx[ac] = i
6464
end
6565
msacode2structfile = Dict{MSACode,String}()
66-
for name in sequencenames(msa)
66+
for name in sequencekeys(msa)
6767
ac = AccessionCode(msa, name)
6868
if haskey(accesscode2idx, ac)
6969
fn = afs[accesscode2idx[ac]]
@@ -116,18 +116,19 @@ function try_download_alphafold(uniprotXname::AbstractString, path::AbstractStri
116116
end
117117

118118
"""
119-
download_alphafolds(msa::AbstractMultipleSequenceAlignment; dirname=pwd())
119+
download_alphafolds(msa; dirname=pwd())
120120
download_alphafolds(ids; dirname=pwd())
121121
122-
Download all available [AlphaFold](https://alphafold.com/) structures for the sequences in `msa`.
123-
Missing entries are silently skipped.
122+
Download all available [AlphaFold](https://alphafold.com/) structures for the
123+
sequences in `msa`. Missing entries are silently skipped.
124124
125-
If a `AbstractMultipleSequenceAlignment` is provided, the downloaded PDB file is checked to ensure that
126-
the residues in the MSA sequence match those in the PDB file. If they do not match, the PDB file is removed.
125+
If an `msa` is provided, each downloaded PDB file is checked to ensure that the
126+
residues in the MSA sequence match those in the PDB file. If they do not match,
127+
the PDB file is removed.
127128
"""
128-
function download_alphafolds(msa::AbstractMultipleSequenceAlignment; dirname=pwd(), maxversion=nothing, kwargs...)
129+
function download_alphafolds(msa; dirname=pwd(), maxversion=nothing, kwargs...)
129130
maxversion === nothing || @warn "`download_alphafolds`: `maxversion` kwarg has no effect and is deprecated" maxlog=1
130-
@showprogress 1 "Downloading AlphaFold files..." for name in sequencenames(msa)
131+
@showprogress 1 "Downloading AlphaFold files..." for name in sequencekeys(msa)
131132
uname = AccessionCode(msa, name)
132133
url = query_alphafold_latest(uname)
133134
url === nothing && continue
@@ -136,14 +137,14 @@ function download_alphafolds(msa::AbstractMultipleSequenceAlignment; dirname=pwd
136137
if !isfile(path)
137138
Downloads.download(url, path)
138139
end
139-
if !validate_seq_residues(getsequence(msa, name), getchain(path))
140+
if !validate_seq_residues(msasequence(msa, name), getchain(path))
140141
@warn "Residues in $path do not match those in the sequence $name, removing PDB file"
141142
rm(path)
142143
end
143144
end
144145
end
145146

146-
function download_alphafolds(ids; dirname=pwd(), kwargs...)
147+
function download_alphafolds(ids::AbstractVector{<:AbstractString}; dirname=pwd(), kwargs...)
147148
@showprogress 1 "Downloading AlphaFold files..." for uname in ids
148149
url = query_alphafold_latest(uname)
149150
url === nothing && continue

0 commit comments

Comments
 (0)