HolyLab
diff --git a/‎Project.toml‎
Lines changed: 1 addition & 1 deletion b/‎Project.toml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎ext/GPCRAnalysisBioStockholmExt.jl‎
Lines changed: 33 additions & 53 deletions b/‎ext/GPCRAnalysisBioStockholmExt.jl‎
Lines changed: 33 additions & 53 deletions
diff --git a/‎ext/GPCRAnalysisMIToSExt.jl‎
Lines changed: 5 additions & 2 deletions b/‎ext/GPCRAnalysisMIToSExt.jl‎
Lines changed: 5 additions & 2 deletions
diff --git a/‎src/GPCRAnalysis.jl‎
Lines changed: 2 additions & 1 deletion b/‎src/GPCRAnalysis.jl‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎src/alphafold.jl‎
Lines changed: 8 additions & 6 deletions b/‎src/alphafold.jl‎
Lines changed: 8 additions & 6 deletions
diff --git a/‎src/analyze.jl‎
Lines changed: 0 additions & 2 deletions b/‎src/analyze.jl‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎src/msa.jl‎
Lines changed: 107 additions & 1 deletion b/‎src/msa.jl‎
Lines changed: 107 additions & 1 deletion
@@ -1,6 +1,6 @@
 name = "GPCRAnalysis"
 uuid = "c1d73f9e-d42a-418a-8d5b-c7b00ec0358f"
-version = "0.6.0"
+version = "0.6.1"
 authors = ["Tim Holy <[email protected]> and contributors"]
 
 [deps]
 
@@ -5,17 +5,45 @@ using BioStockholm
 using BioStockholm: OrderedDict   # from OrderedCollections.jl
 
 function conscols(msa::MSA)
-    ss = msa.GC["SS_cons"]
-    return findfirst(!=( '.'), ss):findlast(!=( '.'), ss)
+    if length(msa.GR) == 1
+        # Fast-path: use the reference sequence
+        key, _ = only(msa.GR)
+        s = msa.seq[key]
+        return findall(s) do c
+            c == '-' || isuppercase(c)
+        end
+    end
+    # Slow path: scan every sequence and keep each column in which at least one has an uppercase character
+    keep = falses(length(msa.GC["seq_cons"]))
+    for (_, s) in msa.seq
+        keep .|= isuppercase.(s)
+    end
+    return findall(keep)
 end
 
 # Low-level API implementation
-# GPCRAnalysis.sequenceindexes(msaseq::AnnotatedAlignedSequence) = getsequencemapping(msaseq)
-# GPCRAnalysis.sequenceindexes(msaseq::MSA, i::Int) = getsequencemapping(msaseq, i)
+GPCRAnalysis.sequenceindexes(msa::MSA, i::Int) = GPCRAnalysis.sequenceindexes(msa::MSA, MSACode(GPCRAnalysis.sequencekeys(msa)[i]))
+function GPCRAnalysis.sequenceindexes(msa::MSA, key::MSACode)
+    # seq = GPCRAnalysis.msasequence(msa, key)
+    seq = msa.seq[String(key)]
+    offset = findfirst(!=('.'), seq)
+    filled = [r != '-' for r in seq]
+    cf = cumsum(filled)
+    keepcols = conscols(msa)
+    m = match(r"/(\d+)-(\d+)$", String(key))
+    if m !== nothing
+        start, stop = parse.(Int, m.captures)
+        Δ = start - offset
+        return (filled .* (cf .+ Δ))[keepcols]
+    end
+    return (filled .* cf)[keepcols]
+end
 GPCRAnalysis.sequencekeys(msa::MSA) = collect(keys(msa.seq))
-GPCRAnalysis.msasequence(msa::MSA, key) = msa.seq[key][conscols(msa)]
+GPCRAnalysis.msasequence(msa::MSA, key::MSACode) = msa.seq[String(key)][conscols(msa)]
+GPCRAnalysis.msasequence(msa::MSA, key::AbstractString) = GPCRAnalysis.msasequence(msa, MSACode(key))
 function GPCRAnalysis.residuematrix(msa::MSA)
     keepcols = conscols(msa)
+    # keepcols = Colon()
     reduce(vcat, [permutedims(seq[keepcols]) for (_, seq) in msa.seq])
 end
 GPCRAnalysis.subseqs(msa::MSA{T}, rowmask::AbstractVector{Bool}) where T = MSA{T}(OrderedDict(pr for (pr, keep) in zip(msa.seq, rowmask) if keep), msa.GF, OrderedDict(pr for (pr, keep) in zip(msa.GS, rowmask) if keep), msa.GC, msa.GR)
@@ -48,52 +76,4 @@ end
 GPCRAnalysis.MSACode(msa::MSA, accession::AccessionCode) = MSACode(msa, accession.name)
 GPCRAnalysis.MSACode(::MSA, accession::MSACode) = accession
 
-
-function reduced_alphabet(r::Char)
-    if r == '-'
-        return 0
-    elseif r in ('A','I','L','M','V')
-        return 1  # hydrophobic
-    elseif r in ('N','Q','S','T')
-        return 2  # polar
-    elseif r in ('R','H','K')
-        return 3  # charged
-    elseif r in ('D','E')
-        return 4  # charged
-    elseif r in ('F','W','Y')
-        return 5  # aromatic
-    end
-    offset = findfirst(==(r), ('C','G','P'))
-    offset === nothing && throw(ArgumentError("Unknown residue '$r'"))
-    return 5 + offset  # special or unknown
-end
-
-GPCRAnalysis.columnwise_entropy(msa) = columnwise_entropy(reduced_alphabet, msa)
-
-function GPCRAnalysis.percent_similarity(f, msa::MSA)
-    # This mimics MIToS's implementation
-    function pctsim(v1, v2)
-        same = l = 0
-        for (a, b) in zip(v1, v2)
-            a == b == 0 && continue  # skip gaps
-            same += a == b
-            l += 1
-        end
-        return 100 * same / l
-    end
-
-    M = f.(GPCRAnalysis.residuematrix(msa))
-    n = size(M, 1)
-    S = zeros(Float64, n, n)
-    for i in 1:n
-        for j in i:n
-            S[i, j] = pctsim(M[i, :], M[j, :])
-            S[j, i] = S[i, j]
-        end
-    end
-    return S
-end
-GPCRAnalysis.percent_similarity(msa::MSA) = GPCRAnalysis.percent_similarity(reduced_alphabet, msa)
-
-
 end
@@ -16,11 +16,14 @@ using MIToS.MSA: getsequence, getannotsequence, getsequencemapping, getresidues,
 
 # Low-level API implementation
 GPCRAnalysis.sequenceindexes(msaseq::AnnotatedAlignedSequence) = getsequencemapping(msaseq)
-GPCRAnalysis.sequenceindexes(msaseq::AbstractMultipleSequenceAlignment, i::Int) = getsequencemapping(msaseq, i)
+GPCRAnalysis.sequenceindexes(msa::AbstractMultipleSequenceAlignment, i::Int) = getsequencemapping(msa, i)
+GPCRAnalysis.sequenceindexes(msa::AbstractMultipleSequenceAlignment, key::AbstractString) = getsequencemapping(msa, key)
+GPCRAnalysis.sequenceindexes(msa::AbstractMultipleSequenceAlignment, key::MSACode) = sequenceindexes(msa, String(key))
 GPCRAnalysis.isgap(res::MSA.Residue) = res == GAP
 GPCRAnalysis.isunknown(res::MSA.Residue) = res == XAA
 GPCRAnalysis.sequencekeys(msa::AbstractMultipleSequenceAlignment) = sequencenames(msa)
-GPCRAnalysis.msasequence(msa::AbstractMultipleSequenceAlignment, key) = getsequence(msa, key)
+GPCRAnalysis.msasequence(msa::AbstractMultipleSequenceAlignment, key::AbstractString) = getsequence(msa, key)
+GPCRAnalysis.msasequence(msa::AbstractMultipleSequenceAlignment, key::MSACode) = msasequence(msa, String(key))
 GPCRAnalysis.residuematrix(msa::AbstractMultipleSequenceAlignment) = getresidues(msa)
 GPCRAnalysis.subseqs(msa::AbstractMultipleSequenceAlignment, rowmask)  = filtersequences(msa, rowmask)
 GPCRAnalysis.subseqs!(msa::AbstractMultipleSequenceAlignment, rowmask) = filtersequences!(msa, rowmask)
 
@@ -37,9 +37,10 @@ const StructureLike = Union{ChainLike, Model, MolecularStructure}
 # export @res_str
 
 export SequenceMapping, AccessionCode, MSACode, NWGapCosts
+export sequenceindexes, columnindexes, isgap, isunknown, sequencekeys, msasequence, residuematrix, subseqs, subseqs!
 export species, uniprotX, query_uniprot_accession, query_ebi_proteins, query_ncbi
 export try_download_alphafold, query_alphafold_latest, download_alphafolds, alphafoldfile, alphafoldfiles, getchain,
-       findall_subseq, pLDDT, pLDDTcolor
+       writechain, findall_subseq, pLDDT, pLDDTcolor
 export align_to_axes, align_to_membrane, align_nw, align_ranges, map_closest, align_closest
 export filter_species!, filter_long!, sortperm_msa, chimerax_script
 export project_sequences, columnwise_entropy, align, residue_centroid, residue_centroid_matrix, alphacarbon_coordinates,
 
@@ -1,5 +1,5 @@
 # Generates 2 captures, one for the uniprotXname and the other for the version
-const rex_alphafold_pdbs = r"AF-([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})-F1-model_v(\d+).(?:pdb|cif|bcif)"
+const rex_alphafold_pdbs = r"AF-([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})-F1-model_v(\d+)(?:_[A-Z])?(?:\.(?:pdb|cif|bcif))?"
 # Make a regex for a specific uniprotXname (single capture for the version)
 regex_alphafold_pdb(uniprotXname) = Regex("AF-$uniprotXname-F1-model_v(\\d+).(?:pdb|cif|bcif)")
 
@@ -62,12 +62,14 @@ function alphafoldfiles(msa, dirname::AbstractString=pwd(); join::Bool=false)
         ac = AccessionCode(match(rex_alphafold_pdbs, af).captures[1])
         accesscode2idx[ac] = i
     end
-    msacode2structfile = Dict{MSACode,String}()
+    K = typeof(first(sequencekeys(msa)))
+    msacode2structfile = Dict{K<:AbstractString ? MSACode : K,String}()
     for name in sequencekeys(msa)
         ac = AccessionCode(msa, name)
         if haskey(accesscode2idx, ac)
             fn = afs[accesscode2idx[ac]]
-            msacode2structfile[MSACode(name)] = join ? joinpath(dirname, fn) : fn
+            mc = isa(name, AbstractString) ? MSACode(name) : name
+            msacode2structfile[mc] = join ? joinpath(dirname, fn) : fn
         end
     end
     return msacode2structfile
@@ -130,23 +132,23 @@ function download_alphafolds(msa; dirname=pwd(), maxversion=nothing, kwargs...)
     maxversion === nothing || @warn "`download_alphafolds`: `maxversion` kwarg has no effect and is deprecated" maxlog=1
     @showprogress 1 "Downloading AlphaFold files..." for name in sequencekeys(msa)
         uname = AccessionCode(msa, name)
-        url = query_alphafold_latest(uname)
+        url = query_alphafold_latest(uname; kwargs...)
         url === nothing && continue
         fn = split(url, '/')[end]
         path = joinpath(dirname, fn)
         if !isfile(path)
             Downloads.download(url, path)
         end
         if !validate_seq_residues(msasequence(msa, name), getchain(path))
-            @warn "Residues in $path do not match those in the sequence $name, removing PDB file"
+            @warn "Residues in $path do not match those in the sequence $name, removing structure file"
             rm(path)
         end
     end
 end
 
 function download_alphafolds(ids::AbstractVector{<:AbstractString}; dirname=pwd(), kwargs...)
     @showprogress 1 "Downloading AlphaFold files..." for uname in ids
-        url = query_alphafold_latest(uname)
+        url = query_alphafold_latest(uname; kwargs...)
         url === nothing && continue
         fn = split(url, '/')[end]
         path = joinpath(dirname, fn)
 
@@ -31,8 +31,6 @@ end
     columnwise_entropy(f, msa)
 
 Compute the entropy of each column in an MSA, after applying `f` to each residue. Low entropy indicates high conservation.
-
-Unmatched entries (`'-'` residues) contribute to the entropy calculation as if they were an ordinary residue.
 """
 function columnwise_entropy(f, msa)
     resnum = map(f, residuematrix(msa))
 
@@ -13,13 +13,42 @@ Return the corresponding index within the full sequence for each position in `ms
 The two-argument form retrieves the sequenceindexes for the `i`th sequence in `msa`.
 """
 function sequenceindexes end
+sequenceindexes(msa::AbstractVector{FASTX.FASTA.Record}, i::Int) = sequenceindexes(msa[i], columnindexes(msa))
+function sequenceindexes(seq::FASTX.FASTA.Record, keepcols::AbstractVector{Int})
+    s = collect(sequence(seq))
+    idx = findfirst(islowercase, s)
+    offset = idx === nothing ? first(keepcols) : idx
+    preambleidx = first(keepcols)
+    filled = map(eachindex(s)) do j
+        j < preambleidx || isuppercase(s[j])
+    end
+    cf = cumsum(filled)
+    m = match(r"/(\d+)-(\d+)$", identifier(seq))
+    if m !== nothing
+        start, stop = parse.(Int, m.captures)
+        Δ = start - offset
+        return (filled .* (cf .+ Δ))[keepcols]
+    end
+    return (filled .* cf)[keepcols]
+end
 
 """
     idxs = columnindexes(msa)
 
-Return the indices (within the reference sequence) covered by the conserved columns of the MSA.
+Return the indices of the conserved columns of the MSA.
 """
 function columnindexes end
+function columnindexes(msa::AbstractVector{FASTX.FASTA.Record})
+    # Scan every record and keep each column in which at least one sequence has an uppercase character
+    nseq = length(msa)
+    ncol = length(sequence(msa[1]))
+    keep = falses(ncol)
+    for rec in msa
+        s = collect(sequence(rec))
+        keep .|= isuppercase.(s)
+    end
+    return findall(keep)
+end
 
 """
     isgap(res)
@@ -28,6 +57,7 @@ Return `true` if the residue `res` is a gap.
 """
 function isgap end
 isgap(c::Char) = c == '-'
+isgap(r::Integer) = iszero(r)   # when mapped to integers, gaps are encoded as 0
 
 """
     isunknown(res)
@@ -43,20 +73,26 @@ isunknown(c::Char) = c == 'X'
 Return the keys (sequence names) of the MSA.
 """
 function sequencekeys end
+sequencekeys(msa::AbstractVector{FASTX.FASTA.Record}) = eachindex(msa)
 
 """
     seq = msasequence(msa, key)
 
 Return the aligned sequence corresponding to `key`.
 """
 function msasequence end
+msasequence(msa::AbstractVector{FASTX.FASTA.Record}, key::Int) = sequence(msa[key])
 
 """
     R = residuematrix(msa)
 
 Get all residues in the MSA as a matrix, one sequence per row.
 """
 function residuematrix end
+function residuematrix(msa::AbstractVector{FASTX.FASTA.Record})
+    M = reduce(vcat, [permutedims(collect(sequence(rec))) for rec in msa])
+    return M[:, columnindexes(msa)]
+end
 
 """
     msaview = subseqs(msa, rowindexes::AbstractVector{Int})
@@ -70,14 +106,80 @@ Construct a reduced-size `msaview`, keeping only the sequences corresponding to
 function subseqs end
 function subseqs! end
 
+subseqs(msa::AbstractVector{FASTX.FASTA.Record}, rowmask) = msa[rowmask]
+subseqs!(msa::AbstractVector{FASTX.FASTA.Record}, rowmask::AbstractVector{Bool}) =
+    deleteat!(msa, findall(!, rowmask))
+subseqs!(msa::AbstractVector{FASTX.FASTA.Record}, rowindexes::AbstractVector{Int}) =
+    deleteat!(msa, setdiff(1:length(msa), rowindexes))
+
+## End of required API; MSA types may also specialize the methods below
+
 """
     pc = percent_similarity(msa)
+    pc = percent_similarity(f, msa)
 
 Compute the percent similarity between all pairs of sequences in `msa`.
 `pc[i, j]` is the percent similarity between sequences `i` and `j`.
+
+The mapping function `f` is applied to each residue before computing similarity;
+when `f` is omitted, the generic fallback uses `reduced_alphabet`.
 """
 function percent_similarity end
 
+function percent_similarity(f, msa)
+    # This mimics MIToS's implementation
+    function pctsim(v1, v2)
+        same = l = 0
+        for (a, b) in zip(v1, v2)
+            isgap(a) && isgap(b) && continue  # skip gaps
+            same += a == b
+            l += 1
+        end
+        return 100 * same / l
+    end
+
+    M = f.(residuematrix(msa))
+    n = size(M, 1)
+    S = zeros(Float64, n, n)
+    for i in 1:n
+        for j in i:n
+            S[i, j] = pctsim(M[i, :], M[j, :])
+            S[j, i] = S[i, j]
+        end
+    end
+    return S
+end
+percent_similarity(msa) = percent_similarity(reduced_alphabet, msa)
+
+function reduced_alphabet(r::Char)
+    if r == '-'
+        return 0
+    elseif r in ('A','I','L','M','V')
+        return 1  # hydrophobic
+    elseif r in ('N','Q','S','T')
+        return 2  # polar
+    elseif r in ('R','H','K')
+        return 3  # charged
+    elseif r in ('D','E')
+        return 4  # charged
+    elseif r in ('F','W','Y')
+        return 5  # aromatic
+    end
+    offset = findfirst(==(r), ('C','G','P'))
+    offset === nothing && throw(ArgumentError("Unknown residue '$r'"))
+    return 5 + offset  # special: 'C' => 6, 'G' => 7, 'P' => 8 (anything else threw above)
+end
+
+columnwise_entropy(msa) = columnwise_entropy(reduced_alphabet, msa)
+
+# Notes on interpreting letter codes in the "GC.seq_cons" field:
+# - `.` indicates a gap
+# - uppercase single-letter amino acid codes indicate strong consensus (>60%)
+# - lowercase single-letter codes (likely interpretations):
+#   + 'a': aromatic
+#   + 'h': hydrophobic
+#   + rest unknown
+# - '+' and '-' indicate positively- and negatively-charged residues, respectively
 
 ## MSA functions
 
@@ -107,6 +209,10 @@ function filter_species!(msa, speciesname::AbstractString)
     mask = map(x -> species(x) == speciesname, sequencekeys(msa))
     subseqs!(msa, mask)
 end
+function filter_species!(msa::AbstractVector{FASTX.FASTA.Record}, speciesname::AbstractString)
+    mask = map(x -> species(x) == speciesname, identifier.(msa))
+    subseqs!(msa, mask)
+end
 
 """
     filter_long!(msa, minres::Real)