@@ -51,7 +51,7 @@ pick_by_id del_by_id find_by_re
5151pick_by_re del_by_re
5252pick_by_file del_by_file
5353find_by_ambig pick_by_ambig del_by_ambig find_by_length
54- del_by_length codon_sim codon_info trim_ends gff_pos) ;
54+ del_by_length codon_sim codon_info trim_ends gff_pos gff_pos_file ) ;
5555
5656# Package global variables
5757my ($in , $out , $seq , %opts , $filename , $in_format , $out_format , $guesser );
@@ -61,46 +61,47 @@ my $VERSION = '1.0';
6161# # For new options, just add an entry into this table with the same key as in
6262# # the GetOpts function in the main program. Make the key be a reference to the handler subroutine (defined below), and test that it works.
6363my %opt_dispatch = (
64+ ' anonymize' => \&anonymize,
65+ ' break' => \&shred_seq,
6466 ' codon-table' => \&codon_table,
6567# 'codon-sim' => \&codon_sim,
6668 ' codon-info' => \&codon_info,
67- ' iep' => \&iso_electric_point,
6869 ' composition' => \&print_composition,
69- ' mol-wt ' => \&print_weight ,
70+ ' count-codons ' => \&count_codons ,
7071 ' delete' => \&filter_seqs,
72+ ' feat2fas' => \&print_gb_gene_feats,
7173 ' fetch' => \&retrieve_seqs,
7274 ' gff-pos' => \&gff_pos,
73- ' no-gaps' => \&remove_gaps,
75+ ' gff-pos-file' => \&gff_pos_file,
76+ ' hydroB' => \&hydroB,
77+ ' iep' => \&iso_electric_point,
78+ ' lead-gaps' => \&count_leading_gaps,
7479 ' length' => \&print_lengths,
80+ ' linearize' => \&linearize,
7581 ' longest-orf' => \&update_longest_orf,
82+ ' mol-wt' => \&print_weight,
83+ ' no-gaps' => \&remove_gaps,
7684 ' num-seq' => \&print_seq_count,
7785 ' num-gaps-dna' => \&count_gaps_dna,
7886 ' num-gaps-aa' => \&count_gaps_aa,
7987 ' pick' => \&filter_seqs,
80- ' revcom' => \&make_revcom,
81- ' subseq' => \&print_subseq,
82- ' translate' => \&reading_frame_ops,
83- # 'restrict-coord' => \&restrict_coord,
84- # 'restrict' => \&restrict_digest,
85- ' anonymize' => \&anonymize,
86- ' break' => \&shred_seq,
87- ' count-codons' => \&count_codons,
88- ' feat2fas' => \&print_gb_gene_feats,
89- ' lead-gaps' => \&count_leading_gaps,
90- ' hydroB' => \&hydroB,
91- ' linearize' => \&linearize,
9288 ' reloop' => \&reloop_at,
9389 ' remove-stop' => \&remove_stop,
94- ' sort' => \&sort_by,
90+ ' rename' => \&rename_id,
91+ ' revcom' => \&make_revcom,
92+ ' subseq' => \&print_subseq,
93+ # 'restrict-coord' => \&restrict_coord,
94+ # 'restrict' => \&restrict_digest,
9595 ' split-cdhit' => \&split_cdhit,
96+ ' sort' => \&sort_by,
9697 ' syn-code' => \&synon_codons,
9798# 'dotplot' => \&draw_dotplot,
9899# 'extract' => \&reading_frame_ops,
99100# 'longest-orf' => \&reading_frame_ops,
100101# 'prefix' => \&anonymize,
101- ' rename' => \&rename_id,
102102# 'slidingwindow' => \&sliding_window,
103- ' trim-ends' => \&trim_ends,
103+ ' translate' => \&reading_frame_ops,
104+ ' trim-ends' => \&trim_ends,
104105 );
105106
106107my %filter_dispatch = (
@@ -158,7 +159,7 @@ sub initialize {
158159# }
159160# $in_format = $guesser->guess() unless $opts{'input'};
160161
161- $in_format = $opts {' gff-pos' } ? ' genbank' : $opts {" input" } ? $opts {" input" } : ' fasta' ;
162+ $in_format = $opts {' gff-pos' } || $opts { ' gff-pos-file ' } ? ' genbank' : $opts {" input" } ? $opts {" input" } : ' fasta' ;
162163
163164# die "Reads only fasta, fastq, embl, genbank. Not aligment file formats like clustalw\n" unless $in_format =~ /fasta|fastq|embl|genbank/;
164165 $in = Bio::SeqIO-> new(-format => $in_format , ($filename eq " STDIN" )? (-fh => \*STDIN ) : (-file => $filename ));
@@ -194,29 +195,23 @@ sub write_out {
194195
195196# ################## subroutines ########################
196197
197- sub gff_pos {
198- my $pos = $opts { " gff-pos " };
198+ # this is used by the two gff-pos functions
199+ sub _gb2orf {
199200 die " $filename : Not a GenBank file. Quit\n " unless $in_format eq ' genbank' ;
200201 $seq = $in -> next_seq();
201- die " $0 --gff-pos\n " unless $pos >= 1 && $pos <= $seq -> length ();
202202 my $gene_count = 0;
203203 my @orfs ;
204- push @orfs , {
205- id => ' query' ,
206- start => $pos ,
207- end => $pos ,
208- strand => undef
209- };
210-
211204 my $last_end ;
205+ my $order = 1;
212206 foreach my $feat ($seq -> get_SeqFeatures()) {
213207 if ($feat -> primary_tag eq ' CDS' ) {
214208 if ($gene_count == 0) { # add 5-UTR
215209 push @orfs , {
216210 id => ' 5UTR' ,
217211 start => 1,
218212 end => $feat -> start() - 1,
219- strand => undef
213+ strand => undef ,
214+ order => $order ++
220215 };
221216 }
222217
@@ -228,9 +223,10 @@ sub gff_pos {
228223 }
229224 push @orfs , {
230225 id => $gene_tag ,
231- start => $feat -> start,
232- end => $feat -> end,
233- strand => $feat -> strand
226+ start => $feat -> start + 0,
227+ end => $feat -> end() + 0,
228+ strand => $feat -> strand + 0,
229+ order => $order ++
234230 };
235231 }
236232 $last_end = $feat -> end();
@@ -240,11 +236,96 @@ sub gff_pos {
240236 id => ' 3UTR' ,
241237 start => $last_end + 1,
242238 end => $seq -> length (),
239+ strand => undef ,
240+ order => $order
241+ };
242+ return \@orfs ;
243+ }
244+
245+ sub gff_pos_file {
246+ open POS, " <" , $opts {" gff-pos-file" } || die " File not found:" , $opts {" gff-pos-file" }, " \n " ;
247+ my @queries ;
248+ my $ct_pos = 1;
249+ while (<POS>){
250+ chomp ;
251+ my @data = split /\t/, $_ ; # first col contains genome positions
252+ next unless $data [0] =~ / ^\d +$ / ;
253+ my $pos = shift @data ;
254+ push @queries , {
255+ id => ' query' ,
256+ start => $pos + 0,
257+ data => \@data ,
258+ order => $ct_pos ++
259+ };
260+ }
261+ close POS;
262+ # print Dumper(\@pos); exit;
263+
264+ my $ref_orfs = _gb2orf();
265+ my @orfs = @$ref_orfs ;
266+ @orfs = sort {$a -> {start } <=> $b -> {start }} @orfs ;
267+ # print Dumper(\@orfs); exit;
268+ # print Dumper(\@queries); exit;
269+
270+ my @output ;
271+ my @mixed = sort {$a -> {start } <=> $b -> {start }} (@orfs , @queries );
272+ # print Dumper(\@mixed); exit;
273+ my $previous_orf ;
274+ my $next_orf ;
275+ for (my $i = 0; $i < @mixed ; $i ++) {
276+ if ($mixed [$i ]-> {id } eq ' query' ) {
277+ ($next_orf ) = grep {$_ -> {order } == $previous_orf -> {order } + 1} @orfs ;
278+ push @output , {
279+ query => $mixed [$i ],
280+ orf_before => $previous_orf ,
281+ orf_after => $next_orf
282+ };
283+ } else {
284+ $previous_orf = $mixed [$i ];
285+ }
286+ }
287+ &_print_positions(\@output )
288+ }
289+
290+ sub _print_positions {
291+ my $ref = shift @_ ;
292+ my @out = @$ref ;
293+ foreach (sort {$a -> {query }-> {order } <=> $b -> {query }-> {order }} @out ) {
294+ my $query = $_ -> {query };
295+ my $before = $_ -> {orf_before };
296+ my $after = $_ -> {orf_after };
297+
298+ my ($tag_before , $tag_after );
299+ my $pos = $query -> {start };
300+ my $ref_data = $query -> {data };
301+
302+ if ($pos <= $before -> {end }) {
303+ $tag_before = $before -> {id };
304+ $tag_after = $before -> {id };
305+ } elsif ($pos > $before -> {end } && $pos < $after -> {start }) {
306+ $tag_before = $before -> {id };
307+ $tag_after = $after -> {id };
308+ } else {
309+ $tag_before = $after -> {id };
310+ $tag_after = $after -> {id };
311+ }
312+ print join " \t " , ($pos , $tag_before , $tag_after , @$ref_data ), " \n " ;
313+ }
314+ }
315+
316+ sub gff_pos {
317+ my $pos = $opts {" gff-pos" };
318+ my $ref_orfs = _gb2orf();
319+ my @orfs = @$ref_orfs ;
320+
321+ push @orfs , {
322+ id => ' query' ,
323+ start => $pos ,
324+ end => $pos ,
243325 strand => undef
244326 };
245327
246328 @orfs = sort {$a -> {start } <=> $b -> {start }} @orfs ;
247- # print Dumper(\@orfs); exit;
248329 my $order_query = 0;
249330 for (my $i = 0; $i < @orfs ; $i ++) {
250331 $order_query = $i if $orfs [$i ]-> {id } eq ' query' ;
0 commit comments