MC-Annotate-parser/parse_MCAnnotate.pl at master · AntonPetrov/MC-Annotate-parser · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
#!/usr/bin/perl -w

    # http://rna.bgsu.edu/FR3D/AnalyzedStructures/1J5E/1J5E_stacking.html
    # interactions are listed only in one direction - fixed.
    # insertion codes are present, but alternate ids are missing
    # for stacking only positions in chains are given - A1040
    # check that all stacking correspondences match up - done. Used 1J5E_A.
    # Had to replace named capture groups with $1 etc because perl on the server is at v5.8

    # Alternative implementation: java parser written by Jose (Dumontier's lab);
    # http://code.google.com/p/semanticscience/source/browse/branches/jose/java/MC-Annotator/trunk/MC-Annotator/src/main/java/com/dumontierlab/mcannotator/bin/ParseMCAnnotate.java?r=677

    # pdb_id,pdb_type,NT.ModelNum,NT.Chain,NT.Number,NT.Base,insCode,alternateId
    # does not recognize alternate ids - tested on 1DK1 (residue 27). Verify that it's always A. How?

use Switch;

# ---------------------------------------------------------------------------
# Command line handling and output file setup.
#
# Usage: parse_MCAnnotate.pl <MC-Annotate output file>
#
# The name of the input file must contain "<pdbId>_<type>", where <pdbId> is
# four alphanumeric characters and <type> is either "A" (asymmetric unit) or
# a number (biological assembly), e.g. 1J5E_A or 1J5E_1.
#
# Sets $pdbId and $pdbType, opens the input handle IN and the two output
# handles BPS (base pairs) and BST (base stacking), creating the output
# folder if needed. Exits early when both output files already exist.
# ---------------------------------------------------------------------------
use File::Basename qw(basename);

if ( scalar(@ARGV) < 1 ) {
    die( "Input: text file created by MC-Annotate\n" );
}
chomp(@ARGV);
open( IN,  '<', $ARGV[0] ) or die("Could not open $ARGV[0]: $!\n");

# [A-Za-z0-9] rather than [A-z0-9]: the range A-z also covers the punctuation
# characters located between Z and a in ASCII, including the underscore.
my $idPattern = qr/([A-Za-z0-9]{4}) # $1, pdbId
                   _
                   (A|\d+)          # $2, A for Asymmetric unit, numbers for biological assemblies
                  /x;

# $pdbId and $pdbType are package globals because the rest of the script
# reads them by these names.
our ( $pdbId, $pdbType );
my $assembly;

# Look at the file name first so that a directory such as data_1 cannot be
# mistaken for the id; fall back to the whole path to stay compatible with
# invocations where only the directory carries the id.
( $pdbId, $assembly ) = ( basename( $ARGV[0] ) =~ $idPattern );
unless ( defined($pdbId) ) {
    ( $pdbId, $assembly ) = ( $ARGV[0] =~ $idPattern );
}
unless ( defined($pdbId) ) {
    die('Check the input filename');
}

# AU = asymmetric unit, BA<n> = biological assembly number n
$pdbType = ( $assembly eq 'A' ) ? 'AU' : "BA$assembly";
print $pdbId , '_' , $pdbType , "\n";

my $outputFolder = 'MC_csv';
unless ( -d $outputFolder ) {
    mkdir($outputFolder) or die("Could not create $outputFolder: $!\n");
}
my $bps = $outputFolder . '/MC-bps_' . $pdbId . '_' . $pdbType . '.csv';
my $bst = $outputFolder . '/MC-bst_' . $pdbId . '_' . $pdbType . '.csv';

# Both result files present: treat the input as done and do not overwrite.
if (-e $bps and -e $bst) {
    print "File $ARGV[0] already processed\n";
    exit;
}

open( BPS, '>', $bps ) or die("Could not open $bps: $!\n");
open( BST, '>', $bst ) or die("Could not open $bst: $!\n");
#open( NBPS,'>', 'MC-Annotate_nearBasePairs.csv' ) or die('Could not open MC-Annotate_nearBasePairs.csv');

# ---------------------------------------------------------------------------
# Main parsing loop.
#
# Reads the MC-Annotate report from IN one model at a time and writes
# tab-separated rows to two handles that are already open:
#   BPS - pairing rows:  ntId1, LW pair (may be empty), MC-Annotate pair(s), ntId2
#   BST - stacking rows: ntId1, LW stacking, MC-Annotate stacking term, ntId2
# Every interaction is written twice, once in each direction.
#
# A nucleotide id has the form
#   pdbId_pdbType_model_chain_number_base_insCode
# where insCode is empty when the residue has no insertion code.
#
# Relies on $pdbId and $pdbType having been set from the input file name.
# Lines that match none of the patterns are echoed to STDOUT for inspection.
# ---------------------------------------------------------------------------

# MC-Annotate stacking term => [ LW stacking, LW stacking seen from the
# partner, MC-Annotate term seen from the partner ].
# upward=s35, downward=s53, inward=s33, outward=s55
my %stackInfo = (
    upward   => [ 's35', 's53', 'downward' ],
    downward => [ 's53', 's35', 'upward'   ],
    inward   => [ 's33', 's33', 'inward'   ],
    outward  => [ 's55', 's55', 'outward'  ],
);

my $model = -1;

# Use the section header as the input record separator so that every read
# returns the text up to and including the next "Residue conformations".
local $/ = "Residue conformations";

while ( my $record = <IN> ) {

    $model++;
    # skip the first block because it's always empty
    next if $model == 0;
    print "Model $model\n";

    # MC-Annotate residue id with quotes and dots removed => one-letter base.
    # Filled from the residue conformation lines, read by the stacking branch
    # because stacking lines do not name the bases.
    my %ntMap = ();

    foreach my $line ( split("\n", $record) ) {

        # Parse residue conformations and store residue names
        # A1003 : G C3p_endo anti OR A1003.A : G C3p_endo anti
        if ( $line =~ /^((\w|'\d')(\d+)(\.\w)?) # $1-mcId,$2-chain,$3-number,$4-insCode
                       \s:\s
                       (\w)\s        # $5-residue
                       (?:\w+)_      # pucker atom, not captured
                       (?:\w+)\s     # pucker qualifier, not captured
                       (?:\w+)       # conformation, not captured
                       /x ) {

            my ( $mcId, $res ) = ( $1, $5 );
            $mcId =~ s/('|\.)//g; #'
            $ntMap{$mcId} = $res;
            # store the conformations if necessary
        }


        # Parse base pairs
        # A419-A424 : C-G Ww/Ww pairing antiparallel cis XIX OR A1030.A-A1030.C : G-G O2'/Hh Ss/O2P pairing
        elsif ( $line =~ /^(\w|'\d') # $1-ch1
                           (\d+)     # $2-res1
                           (\.\w)?-  # $3-insCode1
                           (\w|'\d') # $4-ch2
                           (\d+)     # $5-res2
                           (\.\w)?   # $6-insCode2
                           \s:\s
                           (\w)-     # $7-base1
                           (\w)\s    # $8-base2
                           ((?:w|h|s){2}\/(?:w|h|s){2}) # $9-MCpair
                           (?:.*?)pairing\s             # additional descriptions
                           (antiparallel|parallel)\s    # $10-orientation
                           (cis|trans)                  # $11-cis,trans
                           /ix ) {

            # Copy every capture before running another regex.
            my ( $ch1, $res1, $ch2, $res2 ) = ( $1, $2, $4, $5 );
            my $insCode1 = ( defined($3) ) ? substr($3,1,1) : '';
            my $insCode2 = ( defined($6) ) ? substr($6,1,1) : '';
            my ( $base1, $base2 ) = ( $7, $8 );
            my $MCpair   = $9;
            my $cistrans = $11;

            # numeric chain ids are quoted in the report: '0' becomes 0
            $ch1 =~ s/'//g;
            $ch2 =~ s/'//g;

            # pdb_id,pdb_type,NT.ModelNum,NT.Chain,NT.Number,NT.Base,insCode,alternateId
            my $ntId1 = join('_', $pdbId,$pdbType,$model,$ch1,$res1,$base1,$insCode1);
            my $ntId2 = join('_', $pdbId,$pdbType,$model,$ch2,$res2,$base2,$insCode2);

            # $MCpair looks like Ww/Hh: offsets 0-1 hold the first edge and
            # offsets 3-4 the second. The LW label is the first letter of
            # cis or trans followed by the first letter of each edge.
            my $edge1   = substr($MCpair,0,1);
            my $edge2   = substr($MCpair,3,1);
            my $ct      = substr($cistrans,0,1);
            my $LWpair  = $ct . $edge1 . $edge2;
            my $rLWpair = $ct . $edge2 . $edge1;
            my $rMCpair = substr($MCpair,3,2) . '/' . substr($MCpair,0,2);

            print BPS join("\t",$ntId1,$LWpair,$MCpair,$ntId2) , "\n";
            print BPS join("\t",$ntId2,$rLWpair,$rMCpair,$ntId1) , "\n";

        }


        # Parse other MC-Annotate pairs
        # A1266-A1268 : G-A O2'/Hh Hh/O2P pairing OR A1281-A1282 : U-C O2P/Bh adjacent_5p pairing
        elsif ( $line =~ /^(\w|'\d') # $1-ch1
                           (\d+)     # $2-res1
                           (\.\w)?-  # $3-insCode1
                           (\w|'\d') # $4-ch2
                           (\d+)     # $5-res2
                           (\.\w)?   # $6-insCode2
                           \s:\s
                           (\w)-     # $7-base1
                           (\w)\s    # $8-base2
                           ((?:(?:\w|'){2,3}\/(?:\w|'){2,3}\s)+)  # $9-nearMCpair
                           (?:adjacent_.p){0,1}   # don't include adjacency
                           (?:\s.{2,4}ward){0,1}  # don't include stacking
                           (?:\spairing){0,1}
                           /x ) {

            # Copy every capture before running another regex.
            my ( $ch1, $res1, $ch2, $res2 ) = ( $1, $2, $4, $5 );
            my $insCode1 = ( defined($3) ) ? substr($3,1,1) : '';
            my $insCode2 = ( defined($6) ) ? substr($6,1,1) : '';
            my ( $base1, $base2 ) = ( $7, $8 );
            my $nearMCpair = $9;

            # $9 always ends with the whitespace that follows the last
            # edge pair; drop it so that the forward field is trimmed the
            # same way as the reverse field below.
            $nearMCpair =~ s/\s+$//;

            $ch1 =~ s/'//g;
            $ch2 =~ s/'//g;

            # pdb_id,pdb_type,NT.ModelNum,NT.Chain,NT.Number,NT.Base,insCode,alternateId
            my $ntId1 = join('_', $pdbId,$pdbType,$model,$ch1,$res1,$base1,$insCode1);
            my $ntId2 = join('_', $pdbId,$pdbType,$model,$ch2,$res2,$base2,$insCode2);

            # Swap the two sides of every edge pair for the reverse row.
            my @reversed = ();
            foreach my $pair ( split(/\s/,$nearMCpair) ) {
                # Guard against tokens without a slash instead of reusing
                # captures left over from an earlier match.
                next unless $pair =~ m/(.+)\/(.+)/;
                push( @reversed, $2 . '/' . $1 );
            }
            my $rNearMCpair = join(' ', @reversed);

            # no LW label is assigned to these interactions
            my $LWcompatible = '';

            print BPS join("\t",$ntId1,$LWcompatible,$nearMCpair,$ntId2) , "\n";
            print BPS join("\t",$ntId2,$LWcompatible,$rNearMCpair,$ntId1) , "\n";

        }


        # Parse base stacking
        # adjacent base stacking:
        # A1028-A1029 : adjacent_5p upward OR A1030.B-A1030.C : adjacent_5p upward
        # non-adjacent stacking:
        # A1346-A1348 : outward OR A1347-A1373 : inward pairing
        elsif ( $line =~ /^(\w|'\d')(\d+)(\.\w)?- # $1-ch1,$2-res1,$3-insCode1
                           (\w|'\d')(\d+)(\.\w)?  # $4-ch2,$5-res2,$6-insCode2
                           \s:\s
                           (?:\w+\s)?
                           (outward|inward|downward|upward) # $7-Xward
                           /x ) {

            # Copy every capture before running another regex.
            my ( $ch1, $res1, $ch2, $res2 ) = ( $1, $2, $4, $5 );
            my $insCode1 = ( defined($3) ) ? substr($3,1,1) : '';
            my $insCode2 = ( defined($6) ) ? substr($6,1,1) : '';
            my $Xward = $7;
            $ch1 =~ s/'//g;
            $ch2 =~ s/'//g;

            # same key format as the one stored in %ntMap
            my $mcId1 = $ch1 . $res1 . $insCode1;
            my $mcId2 = $ch2 . $res2 . $insCode2;

            # The pattern above only lets the four table keys through.
            my ( $LWstack, $rLWstack, $rXward ) = @{ $stackInfo{$Xward} };

            # A stacking partner without a parsed residue line gets an empty
            # base; say which one instead of passing undef to join.
            foreach my $mcId ( $mcId1, $mcId2 ) {
                unless ( defined( $ntMap{$mcId} ) ) {
                    warn "No residue conformation found for $mcId in model $model\n";
                    $ntMap{$mcId} = '';
                }
            }

            # pdb_id,pdb_type,NT.ModelNum,NT.Chain,NT.Number,NT.Base,insCode,alternateId
            my $ntId1 = join('_',$pdbId,$pdbType,$model,$ch1,$res1,$ntMap{$mcId1},$insCode1);
            my $ntId2 = join('_',$pdbId,$pdbType,$model,$ch2,$res2,$ntMap{$mcId2},$insCode2);
            print BST join("\t",$ntId1,$LWstack,$Xward,$ntId2), "\n";
            print BST join("\t",$ntId2,$rLWstack,$rXward,$ntId1), "\n";

        }

        else {
            # inspect the lines that were not parsed; short lines and
            # dashed section rulers are not worth showing
            if ( length($line) > 15 and $line !~ /-{2,}/ ) {
                print $line , "\n";
            }
        }

    }

    print "++++++++++++++++++\n";

}

close(IN);
# Buffered write errors, a full disk for example, only show up on close.
close(BPS) or die("Could not close the base pair output file: $!\n");
close(BST) or die("Could not close the base stacking output file: $!\n");
#close(NBPS);