HolyLab
diff --git a/‎Project.toml‎
Lines changed: 4 additions & 2 deletions b/‎Project.toml‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎ext/GPCRAnalysisMIToSExt.jl‎
Lines changed: 85 additions & 0 deletions b/‎ext/GPCRAnalysisMIToSExt.jl‎
Lines changed: 85 additions & 0 deletions
diff --git a/‎src/GPCRAnalysis.jl‎
Lines changed: 4 additions & 8 deletions b/‎src/GPCRAnalysis.jl‎
Lines changed: 4 additions & 8 deletions
diff --git a/‎src/align.jl‎
Lines changed: 7 additions & 7 deletions b/‎src/align.jl‎
Lines changed: 7 additions & 7 deletions
diff --git a/‎src/alphafold.jl‎
Lines changed: 14 additions & 13 deletions b/‎src/alphafold.jl‎
Lines changed: 14 additions & 13 deletions
diff --git a/‎src/analyze.jl‎
Lines changed: 7 additions & 9 deletions b/‎src/analyze.jl‎
Lines changed: 7 additions & 9 deletions
diff --git a/‎src/chimerax.jl‎
Lines changed: 5 additions & 5 deletions b/‎src/chimerax.jl‎
Lines changed: 5 additions & 5 deletions
@@ -17,7 +17,6 @@ Hungarian = "e91730f6-4275-51fb-a7a0-7064cfbd3b39"
 Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
 JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
-MIToS = "51bafb47-8a16-5ded-8b04-24ef4eede0b5"
 MultivariateStats = "6f286f6a-111f-5878-ab1e-185364afe411"
 MutableConvexHulls = "948c7aac-0e5e-4631-af23-7a6bb7a17825"
 OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
@@ -30,9 +29,11 @@ TravelingSalesmanHeuristics = "8c8f4381-2cdd-507c-846c-be2bcff6f45f"
 [weakdeps]
 HiGHS = "87dc4568-4c63-4d18-b0c0-bb2238e4078b"
 JuMP = "4076af6c-e467-56ae-b986-b466b2749572"
+MIToS = "51bafb47-8a16-5ded-8b04-24ef4eede0b5"
 
 [extensions]
 GPCRAnalysisJuMPExt = ["JuMP", "HiGHS"]
+GPCRAnalysisMIToSExt = "MIToS"
 
 [compat]
 BioStructures = "4.2"
@@ -65,7 +66,8 @@ julia = "1.10"
 HiGHS = "87dc4568-4c63-4d18-b0c0-bb2238e4078b"
 InvertedIndices = "41ab1584-1d38-5bbf-9106-f11c6c58b48f"
 JuMP = "4076af6c-e467-56ae-b986-b466b2749572"
+MIToS = "51bafb47-8a16-5ded-8b04-24ef4eede0b5"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["HiGHS", "InvertedIndices", "JuMP", "Test"]
+test = ["HiGHS", "InvertedIndices", "JuMP", "MIToS", "Test"]
@@ -0,0 +1,85 @@
+module GPCRAnalysisMIToSExt
+
+# Package extension (declared under `[extensions]` in Project.toml with MIToS as the
+# trigger) that implements GPCRAnalysis's generic MSA API for MIToS alignment types.
+
+using GPCRAnalysis
+using Downloads
+using BioStructures
+using ProgressMeter
+
+using GPCRAnalysis: ChainLike, ResidueLike, StructureLike, _entropy, validate_seq_residues, rex_alphafold_pdbs
+
+using MIToS: MIToS, Pfam, MSA
+using MIToS.MSA: AbstractMultipleSequenceAlignment, AnnotatedAlignedSequence, AnnotatedMultipleSequenceAlignment,
+                 ReducedAlphabet, ResidueAlphabet, GAP, XAA
+using MIToS.MSA: getsequence, getannotsequence, getsequencemapping, getresidues, three2residue, sequencenames,
+                 nsequences, filtersequences, filtersequences!, percentsimilarity
+
+
+# Low-level API implementation: each GPCRAnalysis generic forwards to its MIToS counterpart.
+GPCRAnalysis.sequenceindexes(msaseq::AnnotatedAlignedSequence) = getsequencemapping(msaseq)
+GPCRAnalysis.sequenceindexes(msaseq::AbstractMultipleSequenceAlignment, i::Int) = getsequencemapping(msaseq, i)
+GPCRAnalysis.isgap(res::MSA.Residue) = res == GAP
+GPCRAnalysis.isunknown(res::MSA.Residue) = res == XAA
+GPCRAnalysis.sequencekeys(msa::AbstractMultipleSequenceAlignment) = sequencenames(msa)
+GPCRAnalysis.msasequence(msa::AbstractMultipleSequenceAlignment, key) = getsequence(msa, key)
+GPCRAnalysis.residuematrix(msa::AbstractMultipleSequenceAlignment) = getresidues(msa)
+GPCRAnalysis.subseqs(msa::AbstractMultipleSequenceAlignment, rowmask::AbstractVector{Bool})  = filtersequences(msa, rowmask)
+GPCRAnalysis.subseqs!(msa::AbstractMultipleSequenceAlignment, rowmask::AbstractVector{Bool}) = filtersequences!(msa, rowmask)
+
+# Build a Boolean mask with one entry per sequence of `msa`, `true` at each of `rowindexes`.
+function _rowmask(msa::AbstractMultipleSequenceAlignment, rowindexes::AbstractVector{Int})
+    rowmask = falses(nsequences(msa))
+    rowmask[rowindexes] .= true
+    return rowmask
+end
+
+# Integer-index variants: convert the indexes to a mask and defer to the mask methods.
+# The calls are module-qualified so they do not depend on `subseqs`/`subseqs!` being
+# exported by GPCRAnalysis.
+function GPCRAnalysis.subseqs(msa::AbstractMultipleSequenceAlignment, rowindexes::AbstractVector{Int})
+    return GPCRAnalysis.subseqs(msa, _rowmask(msa, rowindexes))
+end
+function GPCRAnalysis.subseqs!(msa::AbstractMultipleSequenceAlignment, rowindexes::AbstractVector{Int})
+    return GPCRAnalysis.subseqs!(msa, _rowmask(msa, rowindexes))
+end
+GPCRAnalysis.percent_similarity(msa::AbstractMultipleSequenceAlignment) = percentsimilarity(msa)
+
+# Index an alignment by sequence name (`MSACode`) or by accession (`AccessionCode`);
+# an accession is first translated to the matching `MSACode`.
+Base.getindex(msa::AbstractMultipleSequenceAlignment, seqname::MSACode) = getsequence(msa, seqname.name)
+Base.getindex(msa::AbstractMultipleSequenceAlignment, seqname::AccessionCode) = getsequence(msa, MSACode(msa, seqname).name)
+
+# Accession code for the sequence named `seqname`: the sequence's "AC" annotation
+# (with `seqname` itself supplied as the default value), passed through `uniprotX`.
+function GPCRAnalysis.AccessionCode(msa::AnnotatedMultipleSequenceAlignment, seqname::AbstractString)
+    return AccessionCode(uniprotX(getannotsequence(msa, seqname, "AC", seqname)))
+end
+GPCRAnalysis.AccessionCode(msa::AnnotatedMultipleSequenceAlignment, seqname::MSACode) = AccessionCode(msa, seqname.name)
+GPCRAnalysis.AccessionCode(::AnnotatedMultipleSequenceAlignment, seqname::AccessionCode) = seqname
+
+# `MSACode` of the first sequence in `msa` whose accession code equals `accession`.
+# Throws an `ArgumentError` naming the accession when no sequence matches.
+function GPCRAnalysis.MSACode(msa::AnnotatedMultipleSequenceAlignment, accession::AbstractString)
+    seqnames = sequencenames(msa)
+    idx = findfirst(x -> AccessionCode(msa, x).name == accession, seqnames)
+    idx === nothing &&
+        throw(ArgumentError("no sequence with accession code $accession found in the alignment"))
+    return MSACode(seqnames[idx])
+end
+GPCRAnalysis.MSACode(msa::AnnotatedMultipleSequenceAlignment, accession::AccessionCode) = MSACode(msa, accession.name)
+GPCRAnalysis.MSACode(::AnnotatedMultipleSequenceAlignment, accession::MSACode) = accession
+
+GPCRAnalysis.SequenceMapping(seq::AnnotatedAlignedSequence) = SequenceMapping(getsequencemapping(seq))
+
+# Move this to MIToS?
+# These methods extend a MIToS function on MIToS types, so they are only defined when
+# MIToS does not already provide the single-sequence method.
+if !hasmethod(getsequencemapping, Tuple{AnnotatedAlignedSequence})
+    # Mapping for a single aligned sequence, looked up under its first sequence name.
+    function MIToS.MSA.getsequencemapping(seq::AnnotatedAlignedSequence)
+        getsequencemapping(seq, sequencenames(seq)[1])
+    end
+    # Parse the "SeqMap" annotation of sequence `seq_id` into integer indexes.
+    function MIToS.MSA.getsequencemapping(msa::Union{AnnotatedAlignedSequence,AnnotatedMultipleSequenceAlignment}, seq_id::String)
+        MIToS.MSA._str2int_mapping(getannotsequence(msa, seq_id, "SeqMap"))
+    end
+    # Mapping for the first sequence whose name matches the regex `seqid`.
+    function MIToS.MSA.getsequencemapping(msa::AnnotatedMultipleSequenceAlignment, seqid::Regex)
+        # `id` is a position in `sequencenames(msa)`, or `nothing` when nothing matches.
+        # NOTE(review): the no-match case is not handled here -- confirm callers never hit it.
+        id = findfirst(str -> occursin(seqid, str), sequencenames(msa))
+        getsequencemapping(msa, id)
+    end
+end
+
+# Default residue grouping used by `columnwise_entropy` below.
+const reduced_code = ReducedAlphabet("(AILMV)(NQST)(RHK)(DE)(FWY)CGP")
+
+"""
+    columnwise_entropy(msa, aacode = reduced_code)
+
+Compute the per-column entropy of `msa` after mapping each residue through `aacode`
+(i.e., call `columnwise_entropy(r -> aacode[r], msa)`).
+
+The default code is `ReducedAlphabet("(AILMV)(NQST)(RHK)(DE)(FWY)CGP")`, which
+groups residues into the categories hydrophobic (AILMV), polar (NQST), charged
+(RHK and DE), aromatic (FWY), and the "special" residues C, G, and P.
+"""
+GPCRAnalysis.columnwise_entropy(msa::AbstractMultipleSequenceAlignment, aacode::ResidueAlphabet=reduced_code) =
+    GPCRAnalysis.columnwise_entropy(r -> aacode[r], msa)
+
+end
@@ -5,15 +5,9 @@ using Statistics
 using LinearAlgebra
 
 using BioStructures
+using BioStructures: amino_acid_data
 using FASTX
 
-using MIToS: MIToS, Pfam, MSA
-using MIToS.MSA: AbstractMultipleSequenceAlignment, AnnotatedAlignedSequence, AnnotatedMultipleSequenceAlignment,
-                 ReducedAlphabet, GAP, XAA
-using MIToS.MSA: getsequence, getannotsequence, getsequencemapping, getresidues, three2residue, sequencenames,
-                 filtersequences!, percentsimilarity
-using MIToS.PDB: vanderwaalsradius, ishydrophobic, isaromatic, iscationic, isanionic, ishbonddonor, ishbondacceptor
-
 using MultivariateStats
 using Distances
 using OffsetArrays
@@ -57,9 +51,11 @@ export sidechaincentroid, scvector, inward_tm_residues, inward_ecl_residues
 export features_from_structure
 export forcecomponents, optimize_weights, forcedict
 
+include("consts.jl")
 include("naming_conventions.jl")
-include("utils.jl")
 include("msa.jl")
+include("utils.jl")
+include("query.jl")
 include("alphafold.jl")
 include("analyze.jl")
 include("tmalign.jl")
 
@@ -279,22 +279,22 @@ end
 """
     seqtms = align_ranges(seq1, seq2, seq2ranges::AbstractVector{<:AbstractUnitRange})
 
-Transfer `refranges`, a list of reside index spans in `seq2`, to `seq1`. `seq1` and
+Transfer `seq2ranges`, a list of residue index spans in `seq2`, to `seq1`. `seq1` and
 `seq2` must be spatially aligned, and the assignment is made by minimizing
 inter-chain distance subject to the constraint of preserving sequence order.
 """
-function align_ranges(seq1::ChainLike, seq2::AbstractVector{<:AbstractResidue}, refranges::AbstractVector{<:AbstractUnitRange}; kwargs...)
-    anchoridxs = sizehint!(Int[], length(refranges)*2)
-    for r in refranges
+function align_ranges(seq1::ChainLike, seq2::AbstractVector{<:AbstractResidue}, seq2ranges::AbstractVector{<:AbstractUnitRange}; kwargs...)
+    anchoridxs = sizehint!(Int[], length(seq2ranges)*2)
+    for r in seq2ranges
         push!(anchoridxs, first(r), last(r))
     end
-    issorted(anchoridxs) || throw(ArgumentError("`refranges` must be strictly increasing spans, got $refranges"))
+    issorted(anchoridxs) || throw(ArgumentError("`seq2ranges` must be strictly increasing spans, got $seq2ranges"))
     ϕ = align_nw(seq1, seq2[anchoridxs], NWGapCosts{Float64}(open1=Inf); kwargs...)
     @assert last.(ϕ) == eachindex(anchoridxs)
     return [ϕ[i][1]:ϕ[i+1][1] for i in 1:2:length(ϕ)]
 end
-align_ranges(seq1::ChainLike, seq2::Chain, refranges::AbstractVector{<:AbstractUnitRange}; kwargs...) =
-    align_ranges(seq1, collectresidues(seq2), refranges; kwargs...)
+align_ranges(seq1::ChainLike, seq2::Chain, seq2ranges::AbstractVector{<:AbstractUnitRange}; kwargs...) =
+    align_ranges(seq1, collectresidues(seq2), seq2ranges; kwargs...)
 
 function score_nw(D::AbstractMatrix, gapcosts::NWGapCosts)
     Base.require_one_based_indexing(D)
 
@@ -32,7 +32,7 @@ end
 Return the latest version of all AlphaFold files in `dirname`.
 If `join` is `true`, then the full paths are returned.
 """
-function alphafoldfiles(dirname=pwd(); join::Bool=false)
+function alphafoldfiles(dirname::AbstractString=pwd(); join::Bool=false)
     latest = Dict{String,Int}()
     latestfn = Dict{String,String}()
     for fn in readdir(dirname)
@@ -51,19 +51,19 @@ function alphafoldfiles(dirname=pwd(); join::Bool=false)
 end
 
 """
-    msacode2structfile = alphafoldfiles(msa::AnnotatedMultipleSequenceAlignment, dirname=pwd())
+    msacode2structfile = alphafoldfiles(msa, dirname=pwd())
 
 Return a dictionary mapping `MSACode`s to the corresponding AlphaFold structure files.
 """
-function alphafoldfiles(msa::AnnotatedMultipleSequenceAlignment, dirname=pwd(); join::Bool=false)
+function alphafoldfiles(msa, dirname::AbstractString=pwd(); join::Bool=false)
     afs = alphafoldfiles(dirname)
     accesscode2idx = Dict{AccessionCode,Int}()
     for (i, af) in pairs(afs)
         ac = AccessionCode(match(rex_alphafold_pdbs, af).captures[1])
         accesscode2idx[ac] = i
     end
     msacode2structfile = Dict{MSACode,String}()
-    for name in sequencenames(msa)
+    for name in sequencekeys(msa)
         ac = AccessionCode(msa, name)
         if haskey(accesscode2idx, ac)
             fn = afs[accesscode2idx[ac]]
@@ -116,18 +116,19 @@ function try_download_alphafold(uniprotXname::AbstractString, path::AbstractStri
 end
 
 """
-    download_alphafolds(msa::AbstractMultipleSequenceAlignment; dirname=pwd())
+    download_alphafolds(msa; dirname=pwd())
     download_alphafolds(ids; dirname=pwd())
 
-Download all available [AlphaFold](https://alphafold.com/) structures for the sequences in `msa`.
-Missing entries are silently skipped.
+Download all available [AlphaFold](https://alphafold.com/) structures for the
+sequences in `msa`. Missing entries are silently skipped.
 
-If a `AbstractMultipleSequenceAlignment` is provided, the downloaded PDB file is checked to ensure that
-the residues in the MSA sequence match those in the PDB file. If they do not match, the PDB file is removed.
+If an `msa` is provided, each downloaded PDB file is checked to ensure that the
+residues in the MSA sequence match those in the PDB file. If they do not match,
+the PDB file is removed.
 """
-function download_alphafolds(msa::AbstractMultipleSequenceAlignment; dirname=pwd(), maxversion=nothing, kwargs...)
+function download_alphafolds(msa; dirname=pwd(), maxversion=nothing, kwargs...)
     maxversion === nothing || @warn "`download_alphafolds`: `maxversion` kwarg has no effect and is deprecated" maxlog=1
-    @showprogress 1 "Downloading AlphaFold files..." for name in sequencenames(msa)
+    @showprogress 1 "Downloading AlphaFold files..." for name in sequencekeys(msa)
         uname = AccessionCode(msa, name)
         url = query_alphafold_latest(uname)
         url === nothing && continue
@@ -136,14 +137,14 @@ function download_alphafolds(msa::AbstractMultipleSequenceAlignment; dirname=pwd
         if !isfile(path)
             Downloads.download(url, path)
         end
-        if !validate_seq_residues(getsequence(msa, name), getchain(path))
+        if !validate_seq_residues(msasequence(msa, name), getchain(path))
             @warn "Residues in $path do not match those in the sequence $name, removing PDB file"
             rm(path)
         end
     end
 end
 
-function download_alphafolds(ids; dirname=pwd(), kwargs...)
+function download_alphafolds(ids::AbstractVector{<:AbstractString}; dirname=pwd(), kwargs...)
     @showprogress 1 "Downloading AlphaFold files..." for uname in ids
         url = query_alphafold_latest(uname)
         url === nothing && continue
 
@@ -1,12 +1,12 @@
 """
-    X = project_sequences(msa::AbstractMultipleSequenceAlignment; fracvar::Real = 0.9)
+    X = project_sequences(msa; fracvar::Real = 0.9)
 
 Perform a classical multidimensional scaling analysis to project the sequences in `msa` to a space
 in which pairwise distances approximately reproduce `100 - percentsimilarity(seq1, seq2)`.
 The dimensionality is chosen to reconstruct a fraction `fracvar` of the variance.
 """
-function project_sequences(msa::AbstractMultipleSequenceAlignment; fracvar::Real = 0.9)
-    sim = percentsimilarity(msa)
+function project_sequences(msa; fracvar::Real = 0.9)
+    sim = percent_similarity(msa)
     D = 100 .- Matrix(sim)
     f = fit(MDS, D; distances=true)
     # Capture sufficient variance
@@ -17,8 +17,6 @@ function project_sequences(msa::AbstractMultipleSequenceAlignment; fracvar::Real
     return X[1:nd, :]
 end
 
-const reduced_code = ReducedAlphabet("(AILMV)(NQST)(RHK)(DE)(FWY)CGP")
-
 function _entropy(v)
     count = countitems(v)
     e = 0.0
@@ -30,14 +28,14 @@ function _entropy(v)
 end
 
 """
-    columnwise_entropy(msa::AbstractMultipleSequenceAlignment, aacode = ReducedAlphabet("(AILMV)(NQST)(RHK)(DE)(FWY)CGP"))
+    columnwise_entropy(f, msa)
 
-Compute the entropy of each column in an MSA. Low entropy indicates high conservation.
+Compute the entropy of each column in an MSA, after applying `f` to each residue. Low entropy indicates high conservation.
 
 Unmatched entries (`'-'` residues) contribute to the entropy calculation as if they were an ordinary residue.
 """
-function columnwise_entropy(msa::AbstractMultipleSequenceAlignment, aacode=reduced_code)
-    resnum = map(r -> aacode[r], getresidues(msa))
+function columnwise_entropy(f, msa)
+    resnum = map(f, residuematrix(msa))
     return map(_entropy, eachcol(resnum))
 end
 
 
@@ -32,7 +32,7 @@ function chimerax_script(scriptfilename, struct_filenames, ridxs::AbstractVector
 end
 
 """
-    chimerax_script(scriptfilename, uprot_list, msa::AnnotatedMultipleSequenceAlignment, colidxs;
+    chimerax_script(scriptfilename, uprot_list, msa, colidxs;
                     dir=pwd(), align=true, chain_transparency=80, styles=Dict{Int,String}(), extras=String[])
 
 Create a [chimerax](https://www.cgl.ucsf.edu/chimerax/) visualization script
@@ -63,17 +63,17 @@ chimerax_script("myscript.cxc", ["P15409"], msa, [i1, i2, i3])
 
 where `i1` through `i3` are column-indices in the `msa` that you'd like to view.
 """
-function chimerax_script(scriptfilename, uprot_list, msa::AnnotatedMultipleSequenceAlignment, colidxs;
+function chimerax_script(scriptfilename, uprot_list, msa, colidxs;
                          dir=pwd(), styles=Dict{Int,String}(), kwargs...)
     ridxs = [Int[] for _ in 1:length(uprot_list)]
     struct_filenames = Vector{String}(undef, length(uprot_list))
     rcstyles = Dict{Tuple{Int,Int},String}()
     afs = alphafoldfiles(msa, dir; join=true)
-    uprot2msaidx = Dict{AccessionCode,Int}(AccessionCode(msa, name) => i for (i, name) in enumerate(sequencenames(msa)))
+    uprot2msaidx = Dict{AccessionCode,Int}(AccessionCode(msa, name) => i for (i, name) in enumerate(sequencekeys(msa)))
     for (i, p) in enumerate(uprot_list)
         j = uprot2msaidx[AccessionCode(p)]
-        struct_filenames[i] = afs[MSACode(sequencenames(msa)[j])]
-        sm = getsequencemapping(msa, j)
+        struct_filenames[i] = afs[MSACode(sequencekeys(msa)[j])]
+        sm = sequenceindexes(msa, j)
         for (j, c) in enumerate(colidxs)
             ridx = sm[c]
             if iszero(ridx)