HolyLab
diff --git a/‎Project.toml‎
Lines changed: 4 additions & 2 deletions b/‎Project.toml‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎ext/GPCRAnalysisMIToSExt.jl‎
Lines changed: 171 additions & 0 deletions b/‎ext/GPCRAnalysisMIToSExt.jl‎
Lines changed: 171 additions & 0 deletions
diff --git a/‎src/GPCRAnalysis.jl‎
Lines changed: 3 additions & 6 deletions b/‎src/GPCRAnalysis.jl‎
Lines changed: 3 additions & 6 deletions
diff --git a/‎src/align.jl‎
Lines changed: 7 additions & 7 deletions b/‎src/align.jl‎
Lines changed: 7 additions & 7 deletions
diff --git a/‎src/alphafold.jl‎
Lines changed: 1 addition & 42 deletions b/‎src/alphafold.jl‎
Lines changed: 1 addition & 42 deletions
diff --git a/‎src/analyze.jl‎
Lines changed: 5 additions & 10 deletions b/‎src/analyze.jl‎
Lines changed: 5 additions & 10 deletions
@@ -17,7 +17,6 @@ Hungarian = "e91730f6-4275-51fb-a7a0-7064cfbd3b39"
 Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
 JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
-MIToS = "51bafb47-8a16-5ded-8b04-24ef4eede0b5"
 MultivariateStats = "6f286f6a-111f-5878-ab1e-185364afe411"
 MutableConvexHulls = "948c7aac-0e5e-4631-af23-7a6bb7a17825"
 OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
@@ -30,9 +29,11 @@ TravelingSalesmanHeuristics = "8c8f4381-2cdd-507c-846c-be2bcff6f45f"
 [weakdeps]
 HiGHS = "87dc4568-4c63-4d18-b0c0-bb2238e4078b"
 JuMP = "4076af6c-e467-56ae-b986-b466b2749572"
+MIToS = "51bafb47-8a16-5ded-8b04-24ef4eede0b5"
 
 [extensions]
 GPCRAnalysisJuMPExt = ["JuMP", "HiGHS"]
+GPCRAnalysisMIToSExt = "MIToS"
 
 [compat]
 BioStructures = "4.2"
@@ -65,7 +66,8 @@ julia = "1.10"
 HiGHS = "87dc4568-4c63-4d18-b0c0-bb2238e4078b"
 InvertedIndices = "41ab1584-1d38-5bbf-9106-f11c6c58b48f"
 JuMP = "4076af6c-e467-56ae-b986-b466b2749572"
+MIToS = "51bafb47-8a16-5ded-8b04-24ef4eede0b5"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["HiGHS", "InvertedIndices", "JuMP", "Test"]
+test = ["HiGHS", "InvertedIndices", "JuMP", "MIToS", "Test"]
@@ -0,0 +1,171 @@
+module GPCRAnalysisMIToSExt
+
+using GPCRAnalysis
+using Downloads
+using BioStructures
+using ProgressMeter
+
+using GPCRAnalysis: ChainLike, ResidueLike, StructureLike, _entropy, validate_seq_residues, rex_alphafold_pdbs
+
+using MIToS: MIToS, Pfam, MSA
+using MIToS.MSA: AbstractMultipleSequenceAlignment, AnnotatedAlignedSequence, AnnotatedMultipleSequenceAlignment,
+                 ReducedAlphabet, GAP, XAA
+using MIToS.MSA: getsequence, getannotsequence, getsequencemapping, getresidues, three2residue, sequencenames,
+                 filtersequences!, percentsimilarity
+
+
+# Index an alignment with an `MSACode`: the sequence is looked up under the code's `name`.
+function Base.getindex(msa::AbstractMultipleSequenceAlignment, seqname::MSACode)
+    return getsequence(msa, seqname.name)
+end
+
+# Index an alignment with an `AccessionCode`: the code is first converted with
+# `MSACode(msa, seqname)`, then the sequence is looked up under that code's `name`.
+function Base.getindex(msa::AbstractMultipleSequenceAlignment, seqname::AccessionCode)
+    code = MSACode(msa, seqname)
+    return getsequence(msa, code.name)
+end
+
+# Two-argument forms of the MIToS.PDB chemical-property predicates. Each one tests whether
+# the pair `(rname, atomname(a))` appears in the corresponding MIToS.PDB internal table.
+function MIToS.PDB.ishydrophobic(a::AbstractAtom, rname::AbstractString)
+    return (rname, atomname(a)) ∈ MIToS.PDB._hydrophobic
+end
+function MIToS.PDB.isaromatic(a::AbstractAtom, rname::AbstractString)
+    return (rname, atomname(a)) ∈ MIToS.PDB._aromatic
+end
+function MIToS.PDB.iscationic(a::AbstractAtom, rname::AbstractString)
+    return (rname, atomname(a)) ∈ MIToS.PDB._cationic
+end
+function MIToS.PDB.isanionic(a::AbstractAtom, rname::AbstractString)
+    return (rname, atomname(a)) ∈ MIToS.PDB._anionic
+end
+# The hydrogen-bond tables are queried through their `keys`.
+function MIToS.PDB.ishbonddonor(a::AbstractAtom, rname::AbstractString)
+    return (rname, atomname(a)) ∈ keys(MIToS.PDB._hbond_donor)
+end
+function MIToS.PDB.ishbondacceptor(a::AbstractAtom, rname::AbstractString)
+    return (rname, atomname(a)) ∈ keys(MIToS.PDB._hbond_acceptor)
+end
+
+# Accession code of the sequence called `seqname`: read its "AC" sequence annotation
+# (with `seqname` supplied as the last argument of `getannotsequence`), pass the result
+# through `uniprotX`, and wrap it in an `AccessionCode`.
+function GPCRAnalysis.AccessionCode(msa::AnnotatedMultipleSequenceAlignment, seqname::AbstractString)
+    accession = getannotsequence(msa, seqname, "AC", seqname)
+    return AccessionCode(uniprotX(accession))
+end
+# An `MSACode` is resolved through its `name`.
+function GPCRAnalysis.AccessionCode(msa::AnnotatedMultipleSequenceAlignment, seqname::MSACode)
+    return AccessionCode(msa, seqname.name)
+end
+# An `AccessionCode` is returned unchanged.
+GPCRAnalysis.AccessionCode(::AnnotatedMultipleSequenceAlignment, seqname::AccessionCode) = seqname
+
+# Return the `MSACode` of the first sequence in `msa` whose accession code (as computed by
+# `AccessionCode(msa, name)`) has a `name` equal to `accession`.
+# Throws an `ArgumentError` when no sequence in `msa` matches.
+function GPCRAnalysis.MSACode(msa::AnnotatedMultipleSequenceAlignment, accession::AbstractString)
+    seqnames = sequencenames(msa)
+    idx = findfirst(x -> AccessionCode(msa, x).name == accession, seqnames)
+    # `findfirst` yields `nothing` when there is no match; report that explicitly instead
+    # of failing on the opaque `seqnames[nothing]` indexing error.
+    if idx === nothing
+        throw(ArgumentError("no sequence with accession code $accession found in the alignment"))
+    end
+    return MSACode(seqnames[idx])
+end
+# An `AccessionCode` is resolved through its `name`.
+GPCRAnalysis.MSACode(msa::AnnotatedMultipleSequenceAlignment, accession::AccessionCode) = MSACode(msa, accession.name)
+# An `MSACode` is returned unchanged.
+GPCRAnalysis.MSACode(::AnnotatedMultipleSequenceAlignment, accession::MSACode) = accession
+
+# Build a `SequenceMapping` from the value `getsequencemapping` returns for `seq`.
+function GPCRAnalysis.SequenceMapping(seq::AnnotatedAlignedSequence)
+    return SequenceMapping(getsequencemapping(seq))
+end
+
+# For MIToS alignments, `percent_similarity` forwards to `MIToS.MSA.percentsimilarity`.
+function GPCRAnalysis.percent_similarity(msa::AbstractMultipleSequenceAlignment)
+    return percentsimilarity(msa)
+end
+
+# Fallback `getsequencemapping` methods. They are defined only when the loaded MIToS does
+# not already provide a method accepting a single `AnnotatedAlignedSequence`.
+# TODO: consider moving these methods upstream into MIToS.
+if !hasmethod(getsequencemapping, Tuple{AnnotatedAlignedSequence})
+    # Mapping for an aligned sequence: use the first name reported by `sequencenames(seq)`.
+    function MIToS.MSA.getsequencemapping(seq::AnnotatedAlignedSequence)
+        getsequencemapping(seq, sequencenames(seq)[1])
+    end
+    # Mapping for the sequence `seq_id`: parse its "SeqMap" sequence annotation.
+    function MIToS.MSA.getsequencemapping(msa::Union{AnnotatedAlignedSequence,AnnotatedMultipleSequenceAlignment}, seq_id::String)
+        MIToS.MSA._str2int_mapping(getannotsequence(msa, seq_id, "SeqMap"))
+    end
+    # Mapping for the first sequence whose name matches the regex `seqid`.
+    # Throws an `ArgumentError` when no sequence name matches.
+    function MIToS.MSA.getsequencemapping(msa::AnnotatedMultipleSequenceAlignment, seqid::Regex)
+        id = findfirst(str -> occursin(seqid, str), sequencenames(msa))
+        # `findfirst` yields `nothing` on no match; fail with a clear message rather than
+        # forwarding `nothing` as a sequence identifier.
+        id === nothing && throw(ArgumentError("no sequence name in the alignment matches $seqid"))
+        getsequencemapping(msa, id)
+    end
+end
+
+# Return `true` when every residue of `seq` that is neither `GAP` nor `XAA` equals the
+# residue obtained from `chain` at the index given by the sequence mapping of `seq`;
+# return `false` at the first mismatch.
+function GPCRAnalysis.validate_seq_residues(seq::AnnotatedAlignedSequence, chain)
+    return all(zip(getsequencemapping(seq), seq)) do (i, r)
+        # gap and `XAA` positions are not compared against the chain
+        (r == GAP || r == XAA) && return true
+        return three2residue(String(resname(chain[i]))) == r
+    end
+end
+
+# Default reduced alphabet used by `columnwise_entropy`.
+const reduced_code = ReducedAlphabet("(AILMV)(NQST)(RHK)(DE)(FWY)CGP")
+
+# Entropy of each alignment column: every residue of `msa` is translated through `aacode`,
+# then `_entropy` is applied to each column of the translated matrix.
+function GPCRAnalysis.columnwise_entropy(msa::AbstractMultipleSequenceAlignment, aacode=reduced_code)
+    codes = [aacode[r] for r in getresidues(msa)]
+    return [_entropy(col) for col in eachcol(codes)]
+end
+
+# Keep, in place, only the sequences of `msa` that have strictly more than `minres`
+# non-gap residues. Returns the result of `filtersequences!`.
+function GPCRAnalysis.filter_long!(msa::AbstractMultipleSequenceAlignment, minres::Real)
+    gap = MSA.Residue('-')
+    # number of non-gap entries in each row (sequence) of the alignment
+    nresidues = [count(!=(gap), row) for row in eachrow(msa)]
+    keep = nresidues .> minres
+    return filtersequences!(msa, keep)
+end
+
+# Look up the z-scored property entry of every residue in `permutedims(msa)`.
+# Works on a copy of `aa_properties_zscored` extended with zero entries for the gap
+# character and for 'X'.
+function GPCRAnalysis.aa_properties_matrix(msa::AbstractMultipleSequenceAlignment)
+    props = copy(aa_properties_zscored)
+    for placeholder in (Char(GAP), 'X')
+        props[placeholder] = zero(valtype(props))
+    end
+    msat = permutedims(msa)
+    return [props[Char(residue)] for residue in msat]
+end
+
+# Write a ChimeraX script for the proteins in `uprot_list`, selecting in each one the
+# residues that occupy the alignment columns `colidxs` of `msa`.
+#
+# - `dir`: directory handed to `alphafoldfiles(msa, dir; join=true)` to locate structure files.
+# - `styles`: maps an alignment column to a style string; entries are re-keyed as
+#   `(protein position, column position)` before being forwarded.
+# - `kwargs...`: forwarded to the `chimerax_script` method taking structure file names.
+#
+# Columns whose sequence-mapping value is zero for a protein are skipped with a warning.
+function GPCRAnalysis.chimerax_script(scriptfilename, uprot_list, msa::AnnotatedMultipleSequenceAlignment, colidxs;
+                         dir=pwd(), styles=Dict{Int,String}(), kwargs...)
+    seqnames = sequencenames(msa)   # loop-invariant, computed once
+    ridxs = [Int[] for _ in 1:length(uprot_list)]
+    struct_filenames = Vector{String}(undef, length(uprot_list))
+    rcstyles = Dict{Tuple{Int,Int},String}()
+    afs = alphafoldfiles(msa, dir; join=true)
+    uprot2msaidx = Dict{AccessionCode,Int}(AccessionCode(msa, name) => i for (i, name) in enumerate(seqnames))
+    for (i, p) in enumerate(uprot_list)
+        seqidx = uprot2msaidx[AccessionCode(p)]
+        struct_filenames[i] = afs[MSACode(seqnames[seqidx])]
+        sm = getsequencemapping(msa, seqidx)
+        # `k` is the position within `colidxs` (distinct from the sequence index `seqidx`).
+        for (k, c) in enumerate(colidxs)
+            ridx = sm[c]
+            if iszero(ridx)
+                @warn "column $c not set in $p"
+                continue
+            end
+            push!(ridxs[i], ridx)
+            style = get(styles, c, nothing)
+            if style !== nothing
+                # NOTE(review): `k` also counts skipped columns, whereas `ridxs[i]` omits
+                # them -- confirm the downstream method expects positions in `colidxs`.
+                rcstyles[(i, k)] = style
+            end
+        end
+    end
+    return chimerax_script(scriptfilename, struct_filenames, ridxs; styles=rcstyles, kwargs...)
+end
+
+# Keep, in place, only the sequences of `msa` for which `species(name)` equals
+# `speciesname`. Returns the result of `filtersequences!`.
+function GPCRAnalysis.filter_species!(msa::AbstractMultipleSequenceAlignment, speciesname::AbstractString)
+    keep = [species(name) == speciesname for name in sequencenames(msa)]
+    return filtersequences!(msa, keep)
+end
+
+# Gap value for `MSA.Residue`: the residue constructed from the character '-'.
+function GPCRAnalysis.gapres(::Type{MSA.Residue})
+    return MSA.Residue('-')
+end
+
+# Build a `StructAlign` from two structures and their aligned `MSA.Residue` sequences:
+# each structure/alignment pair is wrapped in a `MapAlign` with the shared `quality`.
+function GPCRAnalysis.StructAlign(struct1::ChainLike, struct2::ChainLike,
+                     align1::AbstractVector{MSA.Residue}, align2::AbstractVector{MSA.Residue},
+                     quality)
+    map1 = MapAlign(struct1, align1, quality)
+    map2 = MapAlign(struct2, align2, quality)
+    return StructAlign(map1, map2)
+end
+
+"""
+    msacode2structfile = alphafoldfiles(msa::AnnotatedMultipleSequenceAlignment, dirname=pwd(); join=false)
+
+Return a dictionary mapping the `MSACode` of each sequence in `msa` to the corresponding
+AlphaFold structure file found in `dirname`. Sequences without a matching file are omitted.
+If `join` is `true`, each file name is combined with `dirname` via `joinpath`.
+"""
+function GPCRAnalysis.alphafoldfiles(msa::AnnotatedMultipleSequenceAlignment, dirname=pwd(); join::Bool=false)
+    afs = alphafoldfiles(dirname)
+    # accession code captured from each file name => position of that file in `afs`
+    accesscode2idx = Dict{AccessionCode,Int}(
+        AccessionCode(match(rex_alphafold_pdbs, af).captures[1]) => i for (i, af) in pairs(afs)
+    )
+    msacode2structfile = Dict{MSACode,String}()
+    for name in sequencenames(msa)
+        idx = get(accesscode2idx, AccessionCode(msa, name), nothing)
+        idx === nothing && continue
+        fn = afs[idx]
+        msacode2structfile[MSACode(name)] = join ? joinpath(dirname, fn) : fn
+    end
+    return msacode2structfile
+end
+
+# Download the AlphaFold structure for every sequence in `msa` into `dirname`.
+# Sequences for which `query_alphafold_latest` returns `nothing` are skipped, and files
+# already present are not downloaded again. Each file is then checked with
+# `validate_seq_residues` against its MSA sequence and removed (with a warning) on mismatch.
+# `maxversion` has no effect and only triggers a deprecation warning; remaining `kwargs`
+# are accepted but not used in this method.
+function GPCRAnalysis.download_alphafolds(msa::AbstractMultipleSequenceAlignment; dirname=pwd(), maxversion=nothing, kwargs...)
+    if maxversion !== nothing
+        @warn "`download_alphafolds`: `maxversion` kwarg has no effect and is deprecated" maxlog=1
+    end
+    @showprogress 1 "Downloading AlphaFold files..." for name in sequencenames(msa)
+        url = query_alphafold_latest(AccessionCode(msa, name))
+        url === nothing && continue
+        # the local file name is the last path component of the URL
+        path = joinpath(dirname, last(split(url, '/')))
+        isfile(path) || Downloads.download(url, path)
+        validate_seq_residues(getsequence(msa, name), getchain(path)) && continue
+        @warn "Residues in $path do not match those in the sequence $name, removing PDB file"
+        rm(path)
+    end
+end
+
+end
@@ -5,14 +5,10 @@ using Statistics
 using LinearAlgebra
 
 using BioStructures
+using BioStructures: amino_acid_data
 using FASTX
 
-using MIToS: MIToS, Pfam, MSA
-using MIToS.MSA: AbstractMultipleSequenceAlignment, AnnotatedAlignedSequence, AnnotatedMultipleSequenceAlignment,
-                 ReducedAlphabet, GAP, XAA
-using MIToS.MSA: getsequence, getannotsequence, getsequencemapping, getresidues, three2residue, sequencenames,
-                 filtersequences!, percentsimilarity
-using MIToS.PDB: vanderwaalsradius, ishydrophobic, isaromatic, iscationic, isanionic, ishbonddonor, ishbondacceptor
+# MIToS is now a weak dependency, so the former import is disabled here:
+# `using MIToS.PDB: vanderwaalsradius, ishydrophobic, isaromatic, iscationic, isanionic, ishbonddonor, ishbondacceptor`
 
 using MultivariateStats
 using Distances
@@ -57,6 +53,7 @@ export sidechaincentroid, scvector, inward_tm_residues, inward_ecl_residues
 export features_from_structure
 export forcecomponents, optimize_weights, forcedict
 
+include("consts.jl")
 include("naming_conventions.jl")
 include("utils.jl")
 include("msa.jl")
 
@@ -279,22 +279,22 @@ end
 """
     seqtms = align_ranges(seq1, seq2, seq2ranges::AbstractVector{<:AbstractUnitRange})
 
-Transfer `refranges`, a list of reside index spans in `seq2`, to `seq1`. `seq1` and
+Transfer `seq2ranges`, a list of residue index spans in `seq2`, to `seq1`. `seq1` and
 `seq2` must be spatially aligned, and the assignment is made by minimizing
 inter-chain distance subject to the constraint of preserving sequence order.
 """
-function align_ranges(seq1::ChainLike, seq2::AbstractVector{<:AbstractResidue}, refranges::AbstractVector{<:AbstractUnitRange}; kwargs...)
-    anchoridxs = sizehint!(Int[], length(refranges)*2)
-    for r in refranges
+function align_ranges(seq1::ChainLike, seq2::AbstractVector{<:AbstractResidue}, seq2ranges::AbstractVector{<:AbstractUnitRange}; kwargs...)
+    anchoridxs = sizehint!(Int[], length(seq2ranges)*2)
+    for r in seq2ranges
         push!(anchoridxs, first(r), last(r))
     end
-    issorted(anchoridxs) || throw(ArgumentError("`refranges` must be strictly increasing spans, got $refranges"))
+    issorted(anchoridxs) || throw(ArgumentError("`seq2ranges` must be strictly increasing spans, got $seq2ranges"))
     ϕ = align_nw(seq1, seq2[anchoridxs], NWGapCosts{Float64}(open1=Inf); kwargs...)
     @assert last.(ϕ) == eachindex(anchoridxs)
     return [ϕ[i][1]:ϕ[i+1][1] for i in 1:2:length(ϕ)]
 end
-align_ranges(seq1::ChainLike, seq2::Chain, refranges::AbstractVector{<:AbstractUnitRange}; kwargs...) =
-    align_ranges(seq1, collectresidues(seq2), refranges; kwargs...)
+align_ranges(seq1::ChainLike, seq2::Chain, seq2ranges::AbstractVector{<:AbstractUnitRange}; kwargs...) =
+    align_ranges(seq1, collectresidues(seq2), seq2ranges; kwargs...)
 
 function score_nw(D::AbstractMatrix, gapcosts::NWGapCosts)
     Base.require_one_based_indexing(D)
 
@@ -50,29 +50,6 @@ function alphafoldfiles(dirname=pwd(); join::Bool=false)
     return join ? [joinpath(dirname, fn) for fn in fns] : fns
 end
 
-"""
-    msacode2structfile = alphafoldfiles(msa::AnnotatedMultipleSequenceAlignment, dirname=pwd())
-
-Return a dictionary mapping `MSACode`s to the corresponding AlphaFold structure files.
-"""
-function alphafoldfiles(msa::AnnotatedMultipleSequenceAlignment, dirname=pwd(); join::Bool=false)
-    afs = alphafoldfiles(dirname)
-    accesscode2idx = Dict{AccessionCode,Int}()
-    for (i, af) in pairs(afs)
-        ac = AccessionCode(match(rex_alphafold_pdbs, af).captures[1])
-        accesscode2idx[ac] = i
-    end
-    msacode2structfile = Dict{MSACode,String}()
-    for name in sequencenames(msa)
-        ac = AccessionCode(msa, name)
-        if haskey(accesscode2idx, ac)
-            fn = afs[accesscode2idx[ac]]
-            msacode2structfile[MSACode(name)] = join ? joinpath(dirname, fn) : fn
-        end
-    end
-    return msacode2structfile
-end
-
 """
     url = query_alphafold_latest(uniprotXname; format="cif")
 
@@ -116,7 +93,7 @@ function try_download_alphafold(uniprotXname::AbstractString, path::AbstractStri
 end
 
 """
-    download_alphafolds(msa::AbstractMultipleSequenceAlignment; dirname=pwd())
+    download_alphafolds(msa; dirname=pwd())
     download_alphafolds(ids; dirname=pwd())
 
 Download all available [AlphaFold](https://alphafold.com/) structures for the sequences in `msa`.
@@ -125,24 +102,6 @@ Missing entries are silently skipped.
 If an `AbstractMultipleSequenceAlignment` is provided, the downloaded PDB file is checked to ensure that
 the residues in the MSA sequence match those in the PDB file. If they do not match, the PDB file is removed.
 """
-function download_alphafolds(msa::AbstractMultipleSequenceAlignment; dirname=pwd(), maxversion=nothing, kwargs...)
-    maxversion === nothing || @warn "`download_alphafolds`: `maxversion` kwarg has no effect and is deprecated" maxlog=1
-    @showprogress 1 "Downloading AlphaFold files..." for name in sequencenames(msa)
-        uname = AccessionCode(msa, name)
-        url = query_alphafold_latest(uname)
-        url === nothing && continue
-        fn = split(url, '/')[end]
-        path = joinpath(dirname, fn)
-        if !isfile(path)
-            Downloads.download(url, path)
-        end
-        if !validate_seq_residues(getsequence(msa, name), getchain(path))
-            @warn "Residues in $path do not match those in the sequence $name, removing PDB file"
-            rm(path)
-        end
-    end
-end
-
 function download_alphafolds(ids; dirname=pwd(), kwargs...)
     @showprogress 1 "Downloading AlphaFold files..." for uname in ids
         url = query_alphafold_latest(uname)
 
@@ -1,12 +1,12 @@
 """
-    X = project_sequences(msa::AbstractMultipleSequenceAlignment; fracvar::Real = 0.9)
+    X = project_sequences(msa; fracvar::Real = 0.9)
 
 Perform a classical multidimensional scaling analysis to project the sequences in `msa` to a space
 in which pairwise distances approximately reproduce `100 - percentsimilarity(seq1, seq2)`.
 The dimensionality is chosen to reconstruct `fracvar` of the variance.
 """
-function project_sequences(msa::AbstractMultipleSequenceAlignment; fracvar::Real = 0.9)
-    sim = percentsimilarity(msa)
+function project_sequences(msa; fracvar::Real = 0.9)
+    sim = percent_similarity(msa)
     D = 100 .- Matrix(sim)
     f = fit(MDS, D; distances=true)
     # Capture sufficient variance
@@ -17,8 +17,6 @@ function project_sequences(msa::AbstractMultipleSequenceAlignment; fracvar::Real
     return X[1:nd, :]
 end
 
-const reduced_code = ReducedAlphabet("(AILMV)(NQST)(RHK)(DE)(FWY)CGP")
-
 function _entropy(v)
     count = countitems(v)
     e = 0.0
@@ -30,16 +28,13 @@ function _entropy(v)
 end
 
 """
-    columnwise_entropy(msa::AbstractMultipleSequenceAlignment, aacode = ReducedAlphabet("(AILMV)(NQST)(RHK)(DE)(FWY)CGP"))
+    columnwise_entropy(msa, aacode = reduced_code)
 
 Compute the entropy of each column in an MSA. Low entropy indicates high conservation.
 
 Unmatched entries (`'-'` residues) contribute to the entropy calculation as if they were an ordinary residue.
 """
-function columnwise_entropy(msa::AbstractMultipleSequenceAlignment, aacode=reduced_code)
-    resnum = map(r -> aacode[r], getresidues(msa))
-    return map(_entropy, eachcol(resnum))
-end
+function columnwise_entropy end
 
 """
     residue_centroid(r::AbstractResidue)