|
module GPCRAnalysisMIToSExt

using GPCRAnalysis
using Downloads
using BioStructures
using ProgressMeter

using GPCRAnalysis: ChainLike, ResidueLike, StructureLike, _entropy, validate_seq_residues, rex_alphafold_pdbs

using MIToS: MIToS, Pfam, MSA
using MIToS.MSA: AbstractMultipleSequenceAlignment, AnnotatedAlignedSequence, AnnotatedMultipleSequenceAlignment,
                 ReducedAlphabet, ResidueAlphabet, GAP, XAA
using MIToS.MSA: getsequence, getannotsequence, getsequencemapping, getresidues, three2residue, sequencenames,
                 filtersequences, filtersequences!, percentsimilarity


# Low-level API implementation: each GPCRAnalysis MSA primitive is forwarded to
# the corresponding MIToS.MSA function.
GPCRAnalysis.sequenceindexes(msaseq::AnnotatedAlignedSequence) = getsequencemapping(msaseq)
GPCRAnalysis.sequenceindexes(msaseq::AbstractMultipleSequenceAlignment, i::Int) = getsequencemapping(msaseq, i)
GPCRAnalysis.isgap(res::MSA.Residue) = res == GAP
GPCRAnalysis.isunknown(res::MSA.Residue) = res == XAA
GPCRAnalysis.sequencekeys(msa::AbstractMultipleSequenceAlignment) = sequencenames(msa)
GPCRAnalysis.msasequence(msa::AbstractMultipleSequenceAlignment, key) = getsequence(msa, key)
GPCRAnalysis.residuematrix(msa::AbstractMultipleSequenceAlignment) = getresidues(msa)
GPCRAnalysis.subseqs(msa::AbstractMultipleSequenceAlignment, rowmask::AbstractVector{Bool}) = filtersequences(msa, rowmask)
GPCRAnalysis.subseqs!(msa::AbstractMultipleSequenceAlignment, rowmask::AbstractVector{Bool}) = filtersequences!(msa, rowmask)

# Build a Bool mask with one entry per sequence of `msa`, `true` at each position
# listed in `rowindexes`. `nsequences` is referenced through `MSA` because it is
# not among the names imported above.
function _rowmask(msa::AbstractMultipleSequenceAlignment, rowindexes::AbstractVector{Int})
    rowmask = falses(MSA.nsequences(msa))
    rowmask[rowindexes] .= true
    return rowmask
end

# Index-vector variants: convert the indexes to a mask and defer to the Bool-mask
# methods above.
function GPCRAnalysis.subseqs(msa::AbstractMultipleSequenceAlignment, rowindexes::AbstractVector{Int})
    return GPCRAnalysis.subseqs(msa, _rowmask(msa, rowindexes))
end
function GPCRAnalysis.subseqs!(msa::AbstractMultipleSequenceAlignment, rowindexes::AbstractVector{Int})
    return GPCRAnalysis.subseqs!(msa, _rowmask(msa, rowindexes))
end
GPCRAnalysis.percent_similarity(msa::AbstractMultipleSequenceAlignment) = percentsimilarity(msa)

# Allow `msa[code]` with either kind of code; an `AccessionCode` is first
# translated to the matching `MSACode`.
Base.getindex(msa::AbstractMultipleSequenceAlignment, seqname::MSACode) = getsequence(msa, seqname.name)
Base.getindex(msa::AbstractMultipleSequenceAlignment, seqname::AccessionCode) = getsequence(msa, MSACode(msa, seqname).name)

# Build the `AccessionCode` for the sequence named `seqname`: read its "AC"
# annotation (with `seqname` itself supplied as the default value) and pass the
# result through `uniprotX`.
function GPCRAnalysis.AccessionCode(msa::AnnotatedMultipleSequenceAlignment, seqname::AbstractString)
    return AccessionCode(uniprotX(getannotsequence(msa, seqname, "AC", seqname)))
end
GPCRAnalysis.AccessionCode(msa::AnnotatedMultipleSequenceAlignment, seqname::MSACode) = AccessionCode(msa, seqname.name)
GPCRAnalysis.AccessionCode(::AnnotatedMultipleSequenceAlignment, seqname::AccessionCode) = seqname

# Find the first sequence name in `msa` whose accession code equals `accession`
# and wrap it in an `MSACode`. Throws an `ArgumentError` naming the accession
# when no sequence matches.
function GPCRAnalysis.MSACode(msa::AnnotatedMultipleSequenceAlignment, accession::AbstractString)
    seqnames = sequencenames(msa)
    idx = findfirst(x -> AccessionCode(msa, x).name == accession, seqnames)
    idx === nothing && throw(ArgumentError("accession code \"$accession\" not found in the alignment"))
    return MSACode(seqnames[idx])
end
GPCRAnalysis.MSACode(msa::AnnotatedMultipleSequenceAlignment, accession::AccessionCode) = MSACode(msa, accession.name)
GPCRAnalysis.MSACode(::AnnotatedMultipleSequenceAlignment, accession::MSACode) = accession

GPCRAnalysis.SequenceMapping(seq::AnnotatedAlignedSequence) = SequenceMapping(getsequencemapping(seq))

# Move this to MIToS?
# Fallback definitions, installed only when the loaded MIToS does not already
# provide `getsequencemapping(::AnnotatedAlignedSequence)`.
if !hasmethod(getsequencemapping, Tuple{AnnotatedAlignedSequence})
    # A single aligned sequence: use the first (presumably only) sequence name.
    function MIToS.MSA.getsequencemapping(seq::AnnotatedAlignedSequence)
        return getsequencemapping(seq, sequencenames(seq)[1])
    end
    # Look up the "SeqMap" annotation of `seq_id` and convert it with MIToS's
    # internal `_str2int_mapping`.
    function MIToS.MSA.getsequencemapping(msa::Union{AnnotatedAlignedSequence,AnnotatedMultipleSequenceAlignment}, seq_id::String)
        return MIToS.MSA._str2int_mapping(getannotsequence(msa, seq_id, "SeqMap"))
    end
    # Use the first sequence whose name matches `seqid`; fail with a clear
    # message when none does.
    function MIToS.MSA.getsequencemapping(msa::AnnotatedMultipleSequenceAlignment, seqid::Regex)
        id = findfirst(str -> occursin(seqid, str), sequencenames(msa))
        id === nothing && throw(ArgumentError("no sequence name in the alignment matches $seqid"))
        return getsequencemapping(msa, id)
    end
end

const reduced_code = ReducedAlphabet("(AILMV)(NQST)(RHK)(DE)(FWY)CGP")

"""
    columnwise_entropy(msa, aacode = reduced_code)

Call `columnwise_entropy` after mapping each residue through `aacode`.

The default code is `ReducedAlphabet("(AILMV)(NQST)(RHK)(DE)(FWY)CGP")`, which
groups residues into the categories hydrophobic (`AILMV`), polar (`NQST`),
positively charged (`RHK`), negatively charged (`DE`), and aromatic (`FWY`),
with the "special" residues `C`, `G`, and `P` listed individually.
"""
GPCRAnalysis.columnwise_entropy(msa::AbstractMultipleSequenceAlignment, aacode::ResidueAlphabet=reduced_code) =
    GPCRAnalysis.columnwise_entropy(r -> aacode[r], msa)

end
0 commit comments