1+ module GPCRAnalysisMIToSExt
2+
3+ using GPCRAnalysis
4+ using Downloads
5+ using BioStructures
6+ using ProgressMeter
7+
8+ using GPCRAnalysis: ChainLike, ResidueLike, StructureLike, _entropy, validate_seq_residues, rex_alphafold_pdbs
9+
10+ using MIToS: MIToS, Pfam, MSA
11+ using MIToS. MSA: AbstractMultipleSequenceAlignment, AnnotatedAlignedSequence, AnnotatedMultipleSequenceAlignment,
12+ ReducedAlphabet, GAP, XAA
13+ using MIToS. MSA: getsequence, getannotsequence, getsequencemapping, getresidues, three2residue, sequencenames,
14+ filtersequences!, percentsimilarity
15+
16+
# Index an alignment by `MSACode`: the sequence is fetched by the code's `name`.
function Base.getindex(msa::AbstractMultipleSequenceAlignment, seqname::MSACode)
    return getsequence(msa, seqname.name)
end

# Index an alignment by `AccessionCode`: the accession is first converted to the
# alignment's `MSACode`, whose `name` is then used to fetch the sequence.
function Base.getindex(msa::AbstractMultipleSequenceAlignment, seqname::AccessionCode)
    code = MSACode(msa, seqname)
    return getsequence(msa, code.name)
end
19+
# Atom classification predicates for `AbstractAtom`s. Each one forms the
# `(residue name, atom name)` pair and tests membership in the matching
# MIToS.PDB lookup table. The residue name is supplied by the caller.

function MIToS.PDB.ishydrophobic(a::AbstractAtom, rname::AbstractString)
    key = (rname, atomname(a))
    return key in MIToS.PDB._hydrophobic
end

function MIToS.PDB.isaromatic(a::AbstractAtom, rname::AbstractString)
    key = (rname, atomname(a))
    return key in MIToS.PDB._aromatic
end

function MIToS.PDB.iscationic(a::AbstractAtom, rname::AbstractString)
    key = (rname, atomname(a))
    return key in MIToS.PDB._cationic
end

function MIToS.PDB.isanionic(a::AbstractAtom, rname::AbstractString)
    key = (rname, atomname(a))
    return key in MIToS.PDB._anionic
end

# The hydrogen-bond tables are queried through their `keys`.
function MIToS.PDB.ishbonddonor(a::AbstractAtom, rname::AbstractString)
    key = (rname, atomname(a))
    return key in keys(MIToS.PDB._hbond_donor)
end

function MIToS.PDB.ishbondacceptor(a::AbstractAtom, rname::AbstractString)
    key = (rname, atomname(a))
    return key in keys(MIToS.PDB._hbond_acceptor)
end
26+
# Obtain the `AccessionCode` for the sequence named `seqname` in `msa`.
# The "AC" sequence annotation is read (the sequence name itself is passed as
# the fallback value), run through `uniprotX`, and wrapped in `AccessionCode`.
function GPCRAnalysis.AccessionCode(msa::AnnotatedMultipleSequenceAlignment, seqname::AbstractString)
    # The annotation key must be exactly "AC" (no padding), otherwise the
    # lookup would always fall through to the default.
    return AccessionCode(uniprotX(getannotsequence(msa, seqname, "AC", seqname)))
end

# An `MSACode` is resolved through its `name`.
function GPCRAnalysis.AccessionCode(msa::AnnotatedMultipleSequenceAlignment, seqname::MSACode)
    return AccessionCode(msa, seqname.name)
end

# Already an `AccessionCode`: returned unchanged; the alignment is not consulted.
function GPCRAnalysis.AccessionCode(::AnnotatedMultipleSequenceAlignment, seqname::AccessionCode)
    return seqname
end
32+
# Find the `MSACode` of the first sequence in `msa` whose accession code
# (as computed by `AccessionCode(msa, name)`) has a `name` equal to `accession`.
# Throws an `ArgumentError` if no sequence in the alignment matches.
function GPCRAnalysis.MSACode(msa::AnnotatedMultipleSequenceAlignment, accession::AbstractString)
    seqnames = sequencenames(msa)
    idx = findfirst(name -> AccessionCode(msa, name).name == accession, seqnames)
    # `findfirst` yields `nothing` when there is no match; report that clearly
    # instead of failing with an "invalid index: nothing" error.
    if idx === nothing
        throw(ArgumentError("accession code $(repr(accession)) was not found among the sequences of the alignment"))
    end
    return MSACode(seqnames[idx])
end

# An `AccessionCode` is resolved through its `name`.
function GPCRAnalysis.MSACode(msa::AnnotatedMultipleSequenceAlignment, accession::AccessionCode)
    return MSACode(msa, accession.name)
end

# Already an `MSACode`: returned unchanged; the alignment is not consulted.
function GPCRAnalysis.MSACode(::AnnotatedMultipleSequenceAlignment, accession::MSACode)
    return accession
end
39+
# Wrap the sequence mapping of an annotated aligned sequence in a `SequenceMapping`.
function GPCRAnalysis.SequenceMapping(seq::AnnotatedAlignedSequence)
    mapping = getsequencemapping(seq)
    return SequenceMapping(mapping)
end
41+
# Forward `percent_similarity` on a MIToS alignment to MIToS's `percentsimilarity`.
function GPCRAnalysis.percent_similarity(msa::AbstractMultipleSequenceAlignment)
    return percentsimilarity(msa)
end
43+
# Move this to MIToS?
# The methods below are only defined when `getsequencemapping` does not already
# have a method accepting a lone `AnnotatedAlignedSequence`.
if !hasmethod(getsequencemapping, Tuple{AnnotatedAlignedSequence})
    # Mapping for a single aligned sequence: use the first sequence name it carries.
    function MIToS.MSA.getsequencemapping(seq::AnnotatedAlignedSequence)
        return getsequencemapping(seq, first(sequencenames(seq)))
    end

    # Mapping by sequence name: parse the "SeqMap" sequence annotation.
    # The annotation key must be exactly "SeqMap" (no padding).
    function MIToS.MSA.getsequencemapping(msa::Union{AnnotatedAlignedSequence,AnnotatedMultipleSequenceAlignment}, seq_id::String)
        return MIToS.MSA._str2int_mapping(getannotsequence(msa, seq_id, "SeqMap"))
    end

    # Mapping by pattern: use the first sequence whose name matches `seqid`.
    function MIToS.MSA.getsequencemapping(msa::AnnotatedMultipleSequenceAlignment, seqid::Regex)
        id = findfirst(str -> occursin(seqid, str), sequencenames(msa))
        # `findfirst` yields `nothing` when no name matches; fail with a clear message.
        if id === nothing
            throw(ArgumentError("no sequence name in the alignment matches $(repr(seqid))"))
        end
        return getsequencemapping(msa, id)
    end
end
57+
# Check that the residues of `seq` agree with those of `chain`.
# Each alignment position is paired with its entry in the sequence mapping;
# positions holding `GAP` or `XAA` are ignored. For the others, the residue
# name at `chain[pos]` is converted with `three2residue` and compared with the
# aligned residue. Returns `false` at the first mismatch, `true` otherwise.
function GPCRAnalysis.validate_seq_residues(seq::AnnotatedAlignedSequence, chain)
    return all(zip(getsequencemapping(seq), seq)) do (pos, aa)
        # Short-circuit: the chain is only consulted for real residues.
        (aa == GAP || aa == XAA) || three2residue(String(resname(chain[pos]))) == aa
    end
end
66+
# Default reduced amino-acid alphabet used by `columnwise_entropy`.
# Parenthesised groups list residues that share one class; C, G and P are
# listed on their own. The specification must not carry leading whitespace.
const reduced_code = ReducedAlphabet("(AILMV)(NQST)(RHK)(DE)(FWY)CGP")
68+
# Entropy of each alignment column.
# Every residue of `msa` is first translated through `aacode` (indexing the
# alphabet with the residue); `_entropy` is then evaluated on each column of
# the translated matrix. `aacode` defaults to `reduced_code`.
function GPCRAnalysis.columnwise_entropy(msa::AbstractMultipleSequenceAlignment, aacode=reduced_code)
    encoded = map(getresidues(msa)) do residue
        aacode[residue]
    end
    return map(_entropy, eachcol(encoded))
end
73+
# Remove short sequences from `msa` in place.
# A sequence is kept only if its number of non-gap residues is strictly
# greater than `minres`. Returns whatever `filtersequences!` returns.
function GPCRAnalysis.filter_long!(msa::AbstractMultipleSequenceAlignment, minres::Real)
    # Build the gap residue once rather than once per row.
    gap = MSA.Residue('-')
    # `count` tallies the non-gap entries of each row (and is well defined
    # even for a row with no entries).
    nresidues = map(eachrow(msa)) do row
        count(!=(gap), row)
    end
    mask = nresidues .> minres
    return filtersequences!(msa, mask)
end
82+
# Build an array of amino-acid property values for `msa`.
# Works on a copy of `aa_properties_zscored`, extended so that the gap
# character and 'X' both map to a zero property value, and looks up every
# residue of the permuted (transposed) alignment in that table.
function GPCRAnalysis.aa_properties_matrix(msa::AbstractMultipleSequenceAlignment)
    # Copy so the shared table is not mutated by the two insertions below.
    props = copy(aa_properties_zscored)
    blank = zero(valtype(props))
    props[Char(GAP)] = blank
    props['X'] = blank
    return [props[Char(residue)] for residue in permutedims(msa)]
end
89+
# Write a ChimeraX script for the proteins in `uprot_list`, selecting residues
# by alignment column.
#
# For each entry `p` of `uprot_list`, the matching sequence of `msa` is found via
# its accession code, its structure file is taken from `alphafoldfiles(msa, dir; join=true)`,
# and every column in `colidxs` is translated to a residue index through the
# sequence mapping. Columns that map to 0 for a protein are skipped with a warning.
# `styles` maps a column index to a style string; entries are re-keyed by
# `(protein position, position of the column within colidxs)` before being
# forwarded. Remaining keyword arguments are passed on unchanged.
#
# Throws a `KeyError` if a protein is not in the alignment or has no structure file.
function GPCRAnalysis.chimerax_script(scriptfilename, uprot_list, msa::AnnotatedMultipleSequenceAlignment, colidxs;
                                      dir=pwd(), styles=Dict{Int,String}(), kwargs...)
    ridxs = [Int[] for _ in 1:length(uprot_list)]
    struct_filenames = Vector{String}(undef, length(uprot_list))
    rcstyles = Dict{Tuple{Int,Int},String}()
    afs = alphafoldfiles(msa, dir; join=true)
    seqnames = sequencenames(msa)
    uprot2msaidx = Dict{AccessionCode,Int}(AccessionCode(msa, name) => i for (i, name) in enumerate(seqnames))
    for (i, p) in enumerate(uprot_list)
        # Row of the alignment holding this protein (named distinctly from the
        # column counter `j` of the inner loop to avoid shadowing).
        seqidx = uprot2msaidx[AccessionCode(p)]
        struct_filenames[i] = afs[MSACode(seqnames[seqidx])]
        sm = getsequencemapping(msa, seqidx)
        for (j, c) in enumerate(colidxs)
            ridx = sm[c]
            if iszero(ridx)
                @warn "column $c not set in $p"
                continue
            end
            push!(ridxs[i], ridx)
            style = get(styles, c, nothing)
            if style !== nothing
                rcstyles[(i, j)] = style
            end
        end
    end
    return chimerax_script(scriptfilename, struct_filenames, ridxs; styles=rcstyles, kwargs...)
end
116+
# Keep, in place, only the sequences of `msa` for which `species` applied to
# the sequence name equals `speciesname`. Returns whatever `filtersequences!` returns.
function GPCRAnalysis.filter_species!(msa::AbstractMultipleSequenceAlignment, speciesname::AbstractString)
    keep = [species(seqname) == speciesname for seqname in sequencenames(msa)]
    return filtersequences!(msa, keep)
end
121+
# Gap residue for alignments whose element type is `MSA.Residue`.
# The character literal must be the single character '-'.
GPCRAnalysis.gapres(::Type{MSA.Residue}) = MSA.Residue('-')
123+
# Construct a `StructAlign` from two structures and their aligned
# `MSA.Residue` sequences. Each structure/alignment pair is turned into a
# `MapAlign` using the same `quality`, and the two results are combined.
function GPCRAnalysis.StructAlign(struct1::ChainLike, struct2::ChainLike,
                                  align1::AbstractVector{MSA.Residue}, align2::AbstractVector{MSA.Residue},
                                  quality)
    first_map = MapAlign(struct1, align1, quality)
    second_map = MapAlign(struct2, align2, quality)
    return StructAlign(first_map, second_map)
end
129+
"""
    msacode2structfile = alphafoldfiles(msa::AnnotatedMultipleSequenceAlignment, dirname=pwd(); join=false)

Return a dictionary mapping the `MSACode` of each sequence in `msa` to its AlphaFold structure file,
chosen among the files reported by `alphafoldfiles(dirname)`. Sequences without a matching file are
left out of the dictionary.

With `join=true` each value is `joinpath(dirname, filename)`; otherwise the file entry is stored as
returned by `alphafoldfiles(dirname)`.
"""
function GPCRAnalysis.alphafoldfiles(msa::AnnotatedMultipleSequenceAlignment, dirname=pwd(); join::Bool=false)
    files = alphafoldfiles(dirname)
    # Accession code (first capture of `rex_alphafold_pdbs`) => position in `files`.
    # If two files share an accession code, the later one wins.
    file_index = Dict{AccessionCode,Int}(
        AccessionCode(match(rex_alphafold_pdbs, file).captures[1]) => k for (k, file) in pairs(files)
    )
    result = Dict{MSACode,String}()
    for seqname in sequencenames(msa)
        k = get(file_index, AccessionCode(msa, seqname), nothing)
        k === nothing && continue
        file = files[k]
        result[MSACode(seqname)] = join ? joinpath(dirname, file) : file
    end
    return result
end
152+
# Download the AlphaFold structure for every sequence of `msa` into `dirname`.
#
# For each sequence name, the accession code is looked up and
# `query_alphafold_latest` is asked for a URL; sequences for which it returns
# `nothing` are skipped. The file is named after the last path component of the
# URL and is downloaded only if it is not already present. The file (whether
# pre-existing or just downloaded) is then checked with `validate_seq_residues`
# against the aligned sequence, and deleted with a warning on mismatch.
#
# `maxversion` is deprecated and ignored (a warning is issued once if it is set).
# Additional keyword arguments are accepted but not used by this method.
function GPCRAnalysis.download_alphafolds(msa::AbstractMultipleSequenceAlignment; dirname=pwd(), maxversion=nothing, kwargs...)
    maxversion === nothing || @warn "`download_alphafolds`: `maxversion` kwarg has no effect and is deprecated" maxlog=1
    @showprogress 1 "Downloading AlphaFold files..." for name in sequencenames(msa)
        uname = AccessionCode(msa, name)
        url = query_alphafold_latest(uname)
        url === nothing && continue
        # File name is the final '/'-separated component of the URL.
        fn = split(url, '/')[end]
        path = joinpath(dirname, fn)
        if !isfile(path)
            Downloads.download(url, path)
        end
        if !validate_seq_residues(getsequence(msa, name), getchain(path))
            @warn "Residues in $path do not match those in the sequence $name, removing PDB file"
            rm(path)
        end
    end
end
170+
171+ end
0 commit comments