HolyLab
diff --git a/‎NEWS.md‎
Lines changed: 8 additions & 0 deletions b/‎NEWS.md‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎Project.toml‎
Lines changed: 9 additions & 3 deletions b/‎Project.toml‎
Lines changed: 9 additions & 3 deletions
diff --git a/‎ext/GPCRAnalysisBioStockholmExt.jl‎
Lines changed: 99 additions & 0 deletions b/‎ext/GPCRAnalysisBioStockholmExt.jl‎
Lines changed: 99 additions & 0 deletions
diff --git a/‎ext/GPCRAnalysisMIToSExt.jl‎
Lines changed: 76 additions & 0 deletions b/‎ext/GPCRAnalysisMIToSExt.jl‎
Lines changed: 76 additions & 0 deletions
diff --git a/‎src/GPCRAnalysis.jl‎
Lines changed: 4 additions & 8 deletions b/‎src/GPCRAnalysis.jl‎
Lines changed: 4 additions & 8 deletions
diff --git a/‎src/align.jl‎
Lines changed: 7 additions & 7 deletions b/‎src/align.jl‎
Lines changed: 7 additions & 7 deletions
diff --git a/‎src/alphafold.jl‎
Lines changed: 14 additions & 13 deletions b/‎src/alphafold.jl‎
Lines changed: 14 additions & 13 deletions
@@ -1,3 +1,11 @@
+# v0.6.0
+
+Breaking changes:
+- MIToS support has been moved to an extension. Users of the MSA functionality now need to add `import MIToS` (in addition to `using GPCRAnalysis`) to trigger loading of the extension and support for the corresponding functionality.
+
+New features:
+- Support for MSAs loaded with BioStockholm (which can sometimes be used instead of MIToS)
+
 # v0.5.0
 
 Breaking changes:
 
@@ -1,7 +1,7 @@
 name = "GPCRAnalysis"
 uuid = "c1d73f9e-d42a-418a-8d5b-c7b00ec0358f"
+version = "0.6.0"
 authors = ["Tim Holy <[email protected]> and contributors"]
-version = "0.5.1"
 
 [deps]
 BioStructures = "de9282ab-8554-53be-b2d6-f6c222edabfc"
@@ -17,7 +17,6 @@ Hungarian = "e91730f6-4275-51fb-a7a0-7064cfbd3b39"
 Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
 JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
-MIToS = "51bafb47-8a16-5ded-8b04-24ef4eede0b5"
 MultivariateStats = "6f286f6a-111f-5878-ab1e-185364afe411"
 MutableConvexHulls = "948c7aac-0e5e-4631-af23-7a6bb7a17825"
 OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
@@ -28,13 +27,18 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 TravelingSalesmanHeuristics = "8c8f4381-2cdd-507c-846c-be2bcff6f45f"
 
 [weakdeps]
+BioStockholm = "eeb925a3-6f9d-43e6-829e-e0ea03b76ecf"
 HiGHS = "87dc4568-4c63-4d18-b0c0-bb2238e4078b"
 JuMP = "4076af6c-e467-56ae-b986-b466b2749572"
+MIToS = "51bafb47-8a16-5ded-8b04-24ef4eede0b5"
 
 [extensions]
+GPCRAnalysisBioStockholmExt = "BioStockholm"
 GPCRAnalysisJuMPExt = ["JuMP", "HiGHS"]
+GPCRAnalysisMIToSExt = "MIToS"
 
 [compat]
+BioStockholm = "0.2.1"
 BioStructures = "4.2"
 ColorTypes = "0.11, 0.12"
 Distances = "0.10"
@@ -62,10 +66,12 @@ TravelingSalesmanHeuristics = "0.3"
 julia = "1.10"
 
 [extras]
+BioStockholm = "eeb925a3-6f9d-43e6-829e-e0ea03b76ecf"
 HiGHS = "87dc4568-4c63-4d18-b0c0-bb2238e4078b"
 InvertedIndices = "41ab1584-1d38-5bbf-9106-f11c6c58b48f"
 JuMP = "4076af6c-e467-56ae-b986-b466b2749572"
+MIToS = "51bafb47-8a16-5ded-8b04-24ef4eede0b5"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["HiGHS", "InvertedIndices", "JuMP", "Test"]
+test = ["BioStockholm", "HiGHS", "InvertedIndices", "JuMP", "MIToS", "Test"]
@@ -0,0 +1,99 @@
+module GPCRAnalysisBioStockholmExt
+
+using GPCRAnalysis
+using BioStockholm
+using BioStockholm: OrderedDict   # from OrderedCollections.jl
+
+# Span of alignment columns delimited by the first and the last entry of the
+# `SS_cons` per-column annotation (`msa.GC["SS_cons"]`) that is not '.'.
+function conscols(msa::MSA)
+    consensus = msa.GC["SS_cons"]
+    isannotated = !=('.')
+    lo = findfirst(isannotated, consensus)
+    hi = findlast(isannotated, consensus)
+    return lo:hi
+end
+
+# Low-level API implementation
+# GPCRAnalysis.sequenceindexes(msaseq::AnnotatedAlignedSequence) = getsequencemapping(msaseq)
+# GPCRAnalysis.sequenceindexes(msaseq::MSA, i::Int) = getsequencemapping(msaseq, i)
+# Keys of `msa.seq`, materialized in that dictionary's iteration order.
+function GPCRAnalysis.sequencekeys(msa::MSA)
+    return collect(keys(msa.seq))
+end
+# Entries of the sequence stored under `key`, restricted to `conscols(msa)`.
+function GPCRAnalysis.msasequence(msa::MSA, key)
+    fullseq = msa.seq[key]
+    return fullseq[conscols(msa)]
+end
+# Matrix with one row per sequence (in `msa.seq` order) and one column per
+# position in `conscols(msa)`.
+function GPCRAnalysis.residuematrix(msa::MSA)
+    cols = conscols(msa)
+    # Each trimmed sequence becomes a 1×ncols row; the rows are then stacked.
+    rows = [permutedims(seq[cols]) for seq in values(msa.seq)]
+    return reduce(vcat, rows)
+end
+# Return a new `MSA` holding only the sequences whose entry in `rowmask` is
+# `true`; `rowmask[i]` refers to the `i`-th entry of `msa.seq`.
+#
+# Per-sequence annotations (`GS`) are kept for exactly the retained sequence
+# names. They are matched by key rather than by position, so the result is
+# correct even when `GS` lacks entries for some sequences or orders them
+# differently from `seq`. `GF`, `GC` and `GR` are passed through unchanged.
+# NOTE(review): `GR` is not filtered, as in the previous implementation — confirm
+# whether its entries for removed sequences should be dropped as well.
+#
+# Throws `DimensionMismatch` if `rowmask` does not have one entry per sequence.
+function GPCRAnalysis.subseqs(msa::MSA{T}, rowmask::AbstractVector{Bool}) where T
+    length(rowmask) == length(msa.seq) ||
+        throw(DimensionMismatch("`rowmask` has $(length(rowmask)) entries but the MSA has $(length(msa.seq)) sequences"))
+    seq = OrderedDict(pr for (pr, keep) in zip(msa.seq, rowmask) if keep)
+    GS = OrderedDict(pr for pr in msa.GS if haskey(seq, first(pr)))
+    return MSA{T}(seq, msa.GF, GS, msa.GC, msa.GR)
+end
+# Remove from `msa`, in place, every sequence whose entry in `rowmask` is
+# `false`, together with its per-sequence annotations in `msa.GS`.
+# `rowmask[i]` refers to the `i`-th entry of `msa.seq`. Returns `msa`.
+#
+# Throws `DimensionMismatch` if `rowmask` does not have one entry per sequence.
+function GPCRAnalysis.subseqs!(msa::MSA, rowmask::AbstractVector{Bool})
+    length(rowmask) == length(msa.seq) ||
+        throw(DimensionMismatch("`rowmask` has $(length(rowmask)) entries but the MSA has $(length(msa.seq)) sequences"))
+    # Decide what to drop before touching the dictionary: deleting entries from
+    # a dictionary while iterating over it relies on implementation details.
+    dropped = [key for (key, keep) in zip(keys(msa.seq), rowmask) if !keep]
+    for key in dropped
+        delete!(msa.seq, key)
+        delete!(msa.GS, key)
+    end
+    return msa
+end
+# Column indexes of the alignment: the span computed by `conscols`.
+function GPCRAnalysis.columnindexes(msa::MSA)
+    return conscols(msa)
+end
+
+# Indexing an MSA by sequence identifier. An `MSACode` is looked up by its name
+# in `msa.seq` and trimmed to `conscols(msa)`; an `AccessionCode` is first
+# translated to the matching `MSACode`.
+function Base.getindex(msa::MSA, seqname::MSACode)
+    fullseq = msa.seq[seqname.name]
+    return fullseq[conscols(msa)]
+end
+function Base.getindex(msa::MSA, seqname::AccessionCode)
+    return msa[MSACode(msa, seqname)]
+end
+
+
+# Accession code of the sequence named `seqname`: its "AC" per-sequence
+# annotation with everything from the first '.' onward dropped.
+function GPCRAnalysis.AccessionCode(msa::MSA, seqname::AbstractString)
+    annotation = msa.GS[seqname]["AC"]
+    return AccessionCode(first(split(annotation, '.')))
+end
+# An `MSACode` is resolved through its name.
+function GPCRAnalysis.AccessionCode(msa::MSA, seqname::MSACode)
+    return AccessionCode(msa, seqname.name)
+end
+# An `AccessionCode` is returned as-is.
+GPCRAnalysis.AccessionCode(::MSA, seqname::AccessionCode) = seqname
+
+# Find the sequence whose "AC" annotation (with everything from the first '.'
+# onward dropped) equals `accession`, and return its name as an `MSACode`.
+#
+# The name is taken directly from the matching `msa.GS` entry, so the lookup does
+# not depend on `GS` and `seq` sharing the same order or length. Sequences
+# without an "AC" annotation are skipped.
+#
+# Throws `ArgumentError` if no sequence carries that accession code.
+function GPCRAnalysis.MSACode(msa::MSA, accession::AbstractString)
+    for (seqname, annotations) in msa.GS
+        ac = get(annotations, "AC", nothing)
+        ac === nothing && continue
+        first(split(ac, '.')) == accession && return MSACode(seqname)
+    end
+    throw(ArgumentError("no sequence with accession code \"$accession\" found in the MSA"))
+end
+# An `AccessionCode` is resolved through its name.
+GPCRAnalysis.MSACode(msa::MSA, accession::AccessionCode) = MSACode(msa, accession.name)
+# An `MSACode` is returned as-is.
+GPCRAnalysis.MSACode(::MSA, accession::MSACode) = accession
+
+
+# Map a residue character to the integer code of its group:
+#   0 gap ('-'), 1 hydrophobic (AILMV), 2 polar (NQST), 3 basic (RHK),
+#   4 acidic (DE), 5 aromatic (FWY), then 6, 7, 8 for the "special" residues
+#   C, G, P, each in a group of its own.
+# Any other character throws an `ArgumentError`.
+function reduced_alphabet(r::Char)
+    r == '-' && return 0
+    groups = (
+        ('A', 'I', 'L', 'M', 'V'),
+        ('N', 'Q', 'S', 'T'),
+        ('R', 'H', 'K'),
+        ('D', 'E'),
+        ('F', 'W', 'Y'),
+        ('C',),
+        ('G',),
+        ('P',),
+    )
+    code = findfirst(group -> r in group, groups)
+    code === nothing && throw(ArgumentError("Unknown residue '$r'"))
+    return code
+end
+
+# Column-wise entropy of a BioStockholm MSA, computed after grouping residues
+# with `reduced_alphabet`.
+# The argument is restricted to `MSA` so that loading this extension does not
+# install a catch-all one-argument method for every other alignment type.
+GPCRAnalysis.columnwise_entropy(msa::MSA) = GPCRAnalysis.columnwise_entropy(reduced_alphabet, msa)
+
+# Pairwise percent similarity between the sequences of `msa`, after mapping
+# every entry of `residuematrix(msa)` through `f`.
+#
+# Returns a symmetric `n × n` `Float64` matrix (`n` = number of rows of the
+# residue matrix) whose `[i, j]` entry is
+# `100 * (matching positions) / (compared positions)`. Positions where both
+# sequences map to `0` (the code `reduced_alphabet` gives to '-') are left out
+# of both counts; if no position remains, the entry is `NaN` (0/0).
+function GPCRAnalysis.percent_similarity(f, msa::MSA)
+    # This mimics MIToS's implementation
+    function pctsim(v1, v2)
+        same = l = 0
+        for (a, b) in zip(v1, v2)
+            a == b == 0 && continue  # skip gaps
+            same += a == b
+            l += 1
+        end
+        return 100 * same / l
+    end
+
+    M = f.(GPCRAnalysis.residuematrix(msa))
+    n = size(M, 1)
+    # Copy each row out exactly once; slicing `M[i, :]` inside the double loop
+    # would allocate two fresh vectors for every pair of sequences.
+    rows = [M[i, :] for i in 1:n]
+    S = zeros(Float64, n, n)
+    for i in 1:n
+        for j in i:n
+            S[i, j] = S[j, i] = pctsim(rows[i], rows[j])
+        end
+    end
+    return S
+end
+# Default: compare sequences after grouping residues with `reduced_alphabet`.
+function GPCRAnalysis.percent_similarity(msa::MSA)
+    return GPCRAnalysis.percent_similarity(reduced_alphabet, msa)
+end
+
+
+end
@@ -0,0 +1,76 @@
+module GPCRAnalysisMIToSExt
+
+using GPCRAnalysis
+using Downloads
+using BioStructures
+using ProgressMeter
+
+using GPCRAnalysis: ChainLike, ResidueLike, StructureLike, _entropy, validate_seq_residues, rex_alphafold_pdbs
+
+using MIToS: MIToS, Pfam, MSA
+using MIToS.MSA: AbstractMultipleSequenceAlignment, AnnotatedAlignedSequence, AnnotatedMultipleSequenceAlignment,
+                 ReducedAlphabet, ResidueAlphabet, GAP, XAA
+using MIToS.MSA: getsequence, getannotsequence, getsequencemapping, getresidues, three2residue, sequencenames,
+                 filtersequences, filtersequences!, percentsimilarity, getcolumnmapping
+
+
+# Low-level API implementation
+# Each method below forwards one GPCRAnalysis low-level API function to the
+# corresponding MIToS call.
+
+# Sequence mappings
+function GPCRAnalysis.sequenceindexes(msaseq::AnnotatedAlignedSequence)
+    return getsequencemapping(msaseq)
+end
+function GPCRAnalysis.sequenceindexes(msaseq::AbstractMultipleSequenceAlignment, i::Int)
+    return getsequencemapping(msaseq, i)
+end
+
+# Residue predicates: comparison against MIToS's `GAP` and `XAA` residues
+function GPCRAnalysis.isgap(res::MSA.Residue)
+    return res == GAP
+end
+function GPCRAnalysis.isunknown(res::MSA.Residue)
+    return res == XAA
+end
+
+# Sequence access
+function GPCRAnalysis.sequencekeys(msa::AbstractMultipleSequenceAlignment)
+    return sequencenames(msa)
+end
+function GPCRAnalysis.msasequence(msa::AbstractMultipleSequenceAlignment, key)
+    return getsequence(msa, key)
+end
+function GPCRAnalysis.residuematrix(msa::AbstractMultipleSequenceAlignment)
+    return getresidues(msa)
+end
+
+# Row filtering (non-mutating and mutating)
+function GPCRAnalysis.subseqs(msa::AbstractMultipleSequenceAlignment, rowmask)
+    return filtersequences(msa, rowmask)
+end
+function GPCRAnalysis.subseqs!(msa::AbstractMultipleSequenceAlignment, rowmask)
+    return filtersequences!(msa, rowmask)
+end
+
+# Whole-alignment queries
+function GPCRAnalysis.percent_similarity(msa::AbstractMultipleSequenceAlignment)
+    return percentsimilarity(msa)
+end
+function GPCRAnalysis.columnindexes(msa::AbstractMultipleSequenceAlignment)
+    return getcolumnmapping(msa)
+end
+
+# Indexing an MSA by sequence identifier: an `MSACode` is looked up by its name;
+# an `AccessionCode` is first translated to the matching `MSACode`.
+function Base.getindex(msa::AbstractMultipleSequenceAlignment, seqname::MSACode)
+    return getsequence(msa, seqname.name)
+end
+function Base.getindex(msa::AbstractMultipleSequenceAlignment, seqname::AccessionCode)
+    code = MSACode(msa, seqname)
+    return getsequence(msa, code.name)
+end
+
+# Accession code for the sequence named `seqname`: the result of
+# `getannotsequence` for the "AC" feature (with `seqname` itself passed as the
+# last argument), run through `uniprotX`.
+function GPCRAnalysis.AccessionCode(msa::AnnotatedMultipleSequenceAlignment, seqname::AbstractString)
+    annotation = getannotsequence(msa, seqname, "AC", seqname)
+    return AccessionCode(uniprotX(annotation))
+end
+# An `MSACode` is resolved through its name.
+function GPCRAnalysis.AccessionCode(msa::AnnotatedMultipleSequenceAlignment, seqname::MSACode)
+    return AccessionCode(msa, seqname.name)
+end
+# An `AccessionCode` is returned as-is.
+GPCRAnalysis.AccessionCode(::AnnotatedMultipleSequenceAlignment, seqname::AccessionCode) = seqname
+
+# Find the first sequence name in `msa` whose accession code equals `accession`
+# and return it as an `MSACode`.
+#
+# Throws `ArgumentError` if no sequence carries that accession code.
+function GPCRAnalysis.MSACode(msa::AnnotatedMultipleSequenceAlignment, accession::AbstractString)
+    seqnames = sequencenames(msa)
+    i = findfirst(x -> AccessionCode(msa, x).name == accession, seqnames)
+    # Without this guard a miss surfaces as an opaque "invalid index: nothing" error.
+    i === nothing &&
+        throw(ArgumentError("no sequence with accession code \"$accession\" found in the MSA"))
+    return MSACode(seqnames[i])
+end
+# An `AccessionCode` is resolved through its name.
+GPCRAnalysis.MSACode(msa::AnnotatedMultipleSequenceAlignment, accession::AccessionCode) = MSACode(msa, accession.name)
+# An `MSACode` is returned as-is.
+GPCRAnalysis.MSACode(::AnnotatedMultipleSequenceAlignment, accession::MSACode) = accession
+
+# Build a `SequenceMapping` from the sequence mapping of an annotated sequence.
+function GPCRAnalysis.SequenceMapping(seq::AnnotatedAlignedSequence)
+    mapping = getsequencemapping(seq)
+    return SequenceMapping(mapping)
+end
+
+# Fallback `getsequencemapping` methods, defined only when no method already
+# exists for a lone `AnnotatedAlignedSequence`.
+# Move this to MIToS?
+if !hasmethod(getsequencemapping, Tuple{AnnotatedAlignedSequence})
+    # A single annotated sequence: use the first of its sequence names.
+    function MIToS.MSA.getsequencemapping(seq::AnnotatedAlignedSequence)
+        names = sequencenames(seq)
+        return getsequencemapping(seq, names[1])
+    end
+    # Lookup by sequence name: decode the "SeqMap" annotation.
+    function MIToS.MSA.getsequencemapping(msa::Union{AnnotatedAlignedSequence,AnnotatedMultipleSequenceAlignment}, seq_id::String)
+        seqmap = getannotsequence(msa, seq_id, "SeqMap")
+        return MIToS.MSA._str2int_mapping(seqmap)
+    end
+    # Lookup by pattern: use the position of the first sequence name matching `seqid`.
+    function MIToS.MSA.getsequencemapping(msa::AnnotatedMultipleSequenceAlignment, seqid::Regex)
+        idx = findfirst(name -> occursin(seqid, name), sequencenames(msa))
+        return getsequencemapping(msa, idx)
+    end
+end
+
+# Default residue grouping used by `columnwise_entropy`.
+const reduced_code = ReducedAlphabet("(AILMV)(NQST)(RHK)(DE)(FWY)CGP")
+
+"""
+    columnwise_entropy(msa, aacode = reduced_code)
+
+Compute `columnwise_entropy` on `msa` with every residue first translated by
+indexing into `aacode`.
+
+When `aacode` is omitted it is `ReducedAlphabet("(AILMV)(NQST)(RHK)(DE)(FWY)CGP")`,
+whose groups correspond to hydrophobic, polar, charged, aromatic, and "special"
+residues.
+"""
+function GPCRAnalysis.columnwise_entropy(msa::AbstractMultipleSequenceAlignment, aacode::ResidueAlphabet=reduced_code)
+    translate(r) = aacode[r]
+    return GPCRAnalysis.columnwise_entropy(translate, msa)
+end
+
+end
@@ -5,15 +5,9 @@ using Statistics
 using LinearAlgebra
 
 using BioStructures
+using BioStructures: amino_acid_data
 using FASTX
 
-using MIToS: MIToS, Pfam, MSA
-using MIToS.MSA: AbstractMultipleSequenceAlignment, AnnotatedAlignedSequence, AnnotatedMultipleSequenceAlignment,
-                 ReducedAlphabet, GAP, XAA
-using MIToS.MSA: getsequence, getannotsequence, getsequencemapping, getresidues, three2residue, sequencenames,
-                 filtersequences!, percentsimilarity
-using MIToS.PDB: vanderwaalsradius, ishydrophobic, isaromatic, iscationic, isanionic, ishbonddonor, ishbondacceptor
-
 using MultivariateStats
 using Distances
 using OffsetArrays
@@ -57,9 +51,11 @@ export sidechaincentroid, scvector, inward_tm_residues, inward_ecl_residues
 export features_from_structure
 export forcecomponents, optimize_weights, forcedict
 
+include("consts.jl")
 include("naming_conventions.jl")
-include("utils.jl")
 include("msa.jl")
+include("utils.jl")
+include("query.jl")
 include("alphafold.jl")
 include("analyze.jl")
 include("tmalign.jl")
 
@@ -279,22 +279,22 @@ end
 """
     seqtms = align_ranges(seq1, seq2, seq2ranges::AbstractVector{<:AbstractUnitRange})
 
-Transfer `refranges`, a list of reside index spans in `seq2`, to `seq1`. `seq1` and
+Transfer `seq2ranges`, a list of residue index spans in `seq2`, to `seq1`. `seq1` and
 `seq2` must be spatially aligned, and the assignment is made by minimizing
 inter-chain distance subject to the constraint of preserving sequence order.
 """
-function align_ranges(seq1::ChainLike, seq2::AbstractVector{<:AbstractResidue}, refranges::AbstractVector{<:AbstractUnitRange}; kwargs...)
-    anchoridxs = sizehint!(Int[], length(refranges)*2)
-    for r in refranges
+function align_ranges(seq1::ChainLike, seq2::AbstractVector{<:AbstractResidue}, seq2ranges::AbstractVector{<:AbstractUnitRange}; kwargs...)
+    anchoridxs = sizehint!(Int[], length(seq2ranges)*2)
+    for r in seq2ranges
         push!(anchoridxs, first(r), last(r))
     end
-    issorted(anchoridxs) || throw(ArgumentError("`refranges` must be strictly increasing spans, got $refranges"))
+    issorted(anchoridxs) || throw(ArgumentError("`seq2ranges` must be strictly increasing spans, got $seq2ranges"))
     ϕ = align_nw(seq1, seq2[anchoridxs], NWGapCosts{Float64}(open1=Inf); kwargs...)
     @assert last.(ϕ) == eachindex(anchoridxs)
     return [ϕ[i][1]:ϕ[i+1][1] for i in 1:2:length(ϕ)]
 end
-align_ranges(seq1::ChainLike, seq2::Chain, refranges::AbstractVector{<:AbstractUnitRange}; kwargs...) =
-    align_ranges(seq1, collectresidues(seq2), refranges; kwargs...)
+align_ranges(seq1::ChainLike, seq2::Chain, seq2ranges::AbstractVector{<:AbstractUnitRange}; kwargs...) =
+    align_ranges(seq1, collectresidues(seq2), seq2ranges; kwargs...)
 
 function score_nw(D::AbstractMatrix, gapcosts::NWGapCosts)
     Base.require_one_based_indexing(D)
 
@@ -32,7 +32,7 @@ end
 Return the latest version of all AlphaFold files in `dirname`.
 If `join` is `true`, then the full paths are returned.
 """
-function alphafoldfiles(dirname=pwd(); join::Bool=false)
+function alphafoldfiles(dirname::AbstractString=pwd(); join::Bool=false)
     latest = Dict{String,Int}()
     latestfn = Dict{String,String}()
     for fn in readdir(dirname)
@@ -51,19 +51,19 @@ function alphafoldfiles(dirname=pwd(); join::Bool=false)
 end
 
 """
-    msacode2structfile = alphafoldfiles(msa::AnnotatedMultipleSequenceAlignment, dirname=pwd())
+    msacode2structfile = alphafoldfiles(msa, dirname=pwd())
 
 Return a dictionary mapping `MSACode`s to the corresponding AlphaFold structure files.
 """
-function alphafoldfiles(msa::AnnotatedMultipleSequenceAlignment, dirname=pwd(); join::Bool=false)
+function alphafoldfiles(msa, dirname::AbstractString=pwd(); join::Bool=false)
     afs = alphafoldfiles(dirname)
     accesscode2idx = Dict{AccessionCode,Int}()
     for (i, af) in pairs(afs)
         ac = AccessionCode(match(rex_alphafold_pdbs, af).captures[1])
         accesscode2idx[ac] = i
     end
     msacode2structfile = Dict{MSACode,String}()
-    for name in sequencenames(msa)
+    for name in sequencekeys(msa)
         ac = AccessionCode(msa, name)
         if haskey(accesscode2idx, ac)
             fn = afs[accesscode2idx[ac]]
@@ -116,18 +116,19 @@ function try_download_alphafold(uniprotXname::AbstractString, path::AbstractStri
 end
 
 """
-    download_alphafolds(msa::AbstractMultipleSequenceAlignment; dirname=pwd())
+    download_alphafolds(msa; dirname=pwd())
     download_alphafolds(ids; dirname=pwd())
 
-Download all available [AlphaFold](https://alphafold.com/) structures for the sequences in `msa`.
-Missing entries are silently skipped.
+Download all available [AlphaFold](https://alphafold.com/) structures for the
+sequences in `msa`. Missing entries are silently skipped.
 
-If a `AbstractMultipleSequenceAlignment` is provided, the downloaded PDB file is checked to ensure that
-the residues in the MSA sequence match those in the PDB file. If they do not match, the PDB file is removed.
+If an `msa` is provided, each downloaded PDB file is checked to ensure that the
+residues in the MSA sequence match those in the PDB file. If they do not match,
+the PDB file is removed.
 """
-function download_alphafolds(msa::AbstractMultipleSequenceAlignment; dirname=pwd(), maxversion=nothing, kwargs...)
+function download_alphafolds(msa; dirname=pwd(), maxversion=nothing, kwargs...)
     maxversion === nothing || @warn "`download_alphafolds`: `maxversion` kwarg has no effect and is deprecated" maxlog=1
-    @showprogress 1 "Downloading AlphaFold files..." for name in sequencenames(msa)
+    @showprogress 1 "Downloading AlphaFold files..." for name in sequencekeys(msa)
         uname = AccessionCode(msa, name)
         url = query_alphafold_latest(uname)
         url === nothing && continue
@@ -136,14 +137,14 @@ function download_alphafolds(msa::AbstractMultipleSequenceAlignment; dirname=pwd
         if !isfile(path)
             Downloads.download(url, path)
         end
-        if !validate_seq_residues(getsequence(msa, name), getchain(path))
+        if !validate_seq_residues(msasequence(msa, name), getchain(path))
             @warn "Residues in $path do not match those in the sequence $name, removing PDB file"
             rm(path)
         end
     end
 end
 
-function download_alphafolds(ids; dirname=pwd(), kwargs...)
+function download_alphafolds(ids::AbstractVector{<:AbstractString}; dirname=pwd(), kwargs...)
     @showprogress 1 "Downloading AlphaFold files..." for uname in ids
         url = query_alphafold_latest(uname)
         url === nothing && continue