Skip to content

Commit d1ce3e8

Browse files
author
wqiu
committed
bioseq: add --gff-pos-orf, the batch version of --gff-pos
1 parent 8dadd4f commit d1ce3e8

File tree

2 files changed

+124
-36
lines changed

2 files changed

+124
-36
lines changed

bin/bioseq

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ GetOptions(
3333
# "dotplot|D=s",
3434
"hydroB|H",
3535
"gff-pos=i",
36+
"gff-pos-file=s",
3637
"iep",
3738
"input|i=s",
3839
"lead-gaps|G",
@@ -222,7 +223,13 @@ Retrieves a sequence from GenBank using the provided accession number, e.g.,
222223

223224
Given a genbank file, find the genome location of a given position.
224225

225-
The coord file is a 4-tab delimited file where each row is a gene location, consisting of locus_tag, begin, end, and strand.
226+
This is inspired by the need to parse ClonalFrameML output files (ML alignment) to identify the locus tags of SNP patterns.
227+
228+
The output is the form of locus_tag1:locus_tag2, where the two tags are the same if the position falls on a coding region and different otherwise.
229+
230+
=item --gff-pos-file <file> <genbank_file>
231+
232+
Given a genbank file, find the genome locations in an input file. It is the batch version of the previous option --gff-pos. The first col contains genome positions.
226233

227234
The output is the form of locus_tag1:locus_tag2, where the two tags are the same if the position falls on a coding region and different otherwise.
228235

lib/Bio/BPWrapper/SeqManipulations.pm

Lines changed: 116 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ pick_by_id del_by_id find_by_re
5151
pick_by_re del_by_re
5252
pick_by_file del_by_file
5353
find_by_ambig pick_by_ambig del_by_ambig find_by_length
54-
del_by_length codon_sim codon_info trim_ends gff_pos);
54+
del_by_length codon_sim codon_info trim_ends gff_pos gff_pos_file);
5555

5656
# Package global variables
5757
my ($in, $out, $seq, %opts, $filename, $in_format, $out_format, $guesser);
@@ -61,46 +61,47 @@ my $VERSION = '1.0';
6161
## For new options, just add an entry into this table with the same key as in
6262
## the GetOpts function in the main program. Make the key be a reference to the handler subroutine (defined below), and test that it works.
6363
my %opt_dispatch = (
64+
'anonymize' => \&anonymize,
65+
'break' => \&shred_seq,
6466
'codon-table' => \&codon_table,
6567
# 'codon-sim' => \&codon_sim,
6668
'codon-info' => \&codon_info,
67-
'iep' => \&iso_electric_point,
6869
'composition' => \&print_composition,
69-
'mol-wt' => \&print_weight,
70+
'count-codons' => \&count_codons,
7071
'delete' => \&filter_seqs,
72+
'feat2fas' => \&print_gb_gene_feats,
7173
'fetch' => \&retrieve_seqs,
7274
'gff-pos' => \&gff_pos,
73-
'no-gaps' => \&remove_gaps,
75+
'gff-pos-file' => \&gff_pos_file,
76+
'hydroB' => \&hydroB,
77+
'iep' => \&iso_electric_point,
78+
'lead-gaps' => \&count_leading_gaps,
7479
'length' => \&print_lengths,
80+
'linearize' => \&linearize,
7581
'longest-orf' => \&update_longest_orf,
82+
'mol-wt' => \&print_weight,
83+
'no-gaps' => \&remove_gaps,
7684
'num-seq' => \&print_seq_count,
7785
'num-gaps-dna' => \&count_gaps_dna,
7886
'num-gaps-aa' => \&count_gaps_aa,
7987
'pick' => \&filter_seqs,
80-
'revcom' => \&make_revcom,
81-
'subseq' => \&print_subseq,
82-
'translate' => \&reading_frame_ops,
83-
# 'restrict-coord' => \&restrict_coord,
84-
# 'restrict' => \&restrict_digest,
85-
'anonymize' => \&anonymize,
86-
'break' => \&shred_seq,
87-
'count-codons' => \&count_codons,
88-
'feat2fas' => \&print_gb_gene_feats,
89-
'lead-gaps' => \&count_leading_gaps,
90-
'hydroB' => \&hydroB,
91-
'linearize' => \&linearize,
9288
'reloop' => \&reloop_at,
9389
'remove-stop' => \&remove_stop,
94-
'sort' => \&sort_by,
90+
'rename' => \&rename_id,
91+
'revcom' => \&make_revcom,
92+
'subseq' => \&print_subseq,
93+
# 'restrict-coord' => \&restrict_coord,
94+
# 'restrict' => \&restrict_digest,
9595
'split-cdhit' => \&split_cdhit,
96+
'sort' => \&sort_by,
9697
'syn-code' => \&synon_codons,
9798
# 'dotplot' => \&draw_dotplot,
9899
# 'extract' => \&reading_frame_ops,
99100
# 'longest-orf' => \&reading_frame_ops,
100101
# 'prefix' => \&anonymize,
101-
'rename' => \&rename_id,
102102
# 'slidingwindow' => \&sliding_window,
103-
'trim-ends' => \&trim_ends,
103+
'translate' => \&reading_frame_ops,
104+
'trim-ends' => \&trim_ends,
104105
);
105106

106107
my %filter_dispatch = (
@@ -158,7 +159,7 @@ sub initialize {
158159
# }
159160
# $in_format = $guesser->guess() unless $opts{'input'};
160161

161-
$in_format = $opts{'gff-pos'} ? 'genbank' : $opts{"input"} ? $opts{"input"} : 'fasta';
162+
$in_format = $opts{'gff-pos'} || $opts{'gff-pos-file'} ? 'genbank' : $opts{"input"} ? $opts{"input"} : 'fasta';
162163

163164
# die "Reads only fasta, fastq, embl, genbank. Not aligment file formats like clustalw\n" unless $in_format =~ /fasta|fastq|embl|genbank/;
164165
$in = Bio::SeqIO->new(-format => $in_format, ($filename eq "STDIN")? (-fh => \*STDIN) : (-file => $filename));
@@ -194,29 +195,23 @@ sub write_out {
194195

195196
################### subroutines ########################
196197

197-
sub gff_pos {
198-
my $pos = $opts{"gff-pos"};
198+
# this is used by the two gff-pos functions
199+
sub _gb2orf {
199200
die "$filename: Not a GenBank file. Quit\n" unless $in_format eq 'genbank';
200201
$seq = $in->next_seq();
201-
die "$0 --gff-pos\n" unless $pos >= 1 && $pos <= $seq->length();
202202
my $gene_count = 0;
203203
my @orfs;
204-
push @orfs, {
205-
id => 'query',
206-
start => $pos,
207-
end => $pos,
208-
strand => undef
209-
};
210-
211204
my $last_end;
205+
my $order = 1;
212206
foreach my $feat ($seq->get_SeqFeatures()) {
213207
if ($feat->primary_tag eq 'CDS') {
214208
if ($gene_count == 0) { # add 5-UTR
215209
push @orfs, {
216210
id => '5UTR',
217211
start => 1,
218212
end => $feat->start() - 1,
219-
strand => undef
213+
strand => undef,
214+
order => $order++
220215
};
221216
}
222217

@@ -228,9 +223,10 @@ sub gff_pos {
228223
}
229224
push @orfs, {
230225
id => $gene_tag,
231-
start => $feat->start,
232-
end => $feat->end,
233-
strand => $feat->strand
226+
start => $feat->start + 0,
227+
end => $feat->end() + 0,
228+
strand => $feat->strand + 0,
229+
order => $order++
234230
};
235231
}
236232
$last_end = $feat->end();
@@ -240,11 +236,96 @@ sub gff_pos {
240236
id => '3UTR',
241237
start => $last_end + 1,
242238
end => $seq->length(),
239+
strand => undef,
240+
order => $order
241+
};
242+
return \@orfs;
243+
}
244+
245+
sub gff_pos_file {
246+
open POS, "<", $opts{"gff-pos-file"} || die "File not found:", $opts{"gff-pos-file"}, "\n";
247+
my @queries;
248+
my $ct_pos = 1;
249+
while(<POS>){
250+
chomp;
251+
my @data = split /\t/, $_; # first col contains genome positions
252+
next unless $data[0] =~ /^\d+$/;
253+
my $pos = shift @data;
254+
push @queries, {
255+
id => 'query',
256+
start => $pos + 0,
257+
data => \@data,
258+
order => $ct_pos++
259+
};
260+
}
261+
close POS;
262+
# print Dumper(\@pos); exit;
263+
264+
my $ref_orfs = _gb2orf();
265+
my @orfs = @$ref_orfs;
266+
@orfs = sort{$a->{start} <=> $b->{start}} @orfs;
267+
# print Dumper(\@orfs); exit;
268+
# print Dumper(\@queries); exit;
269+
270+
my @output;
271+
my @mixed = sort{$a->{start} <=> $b->{start}} (@orfs, @queries);
272+
# print Dumper(\@mixed); exit;
273+
my $previous_orf;
274+
my $next_orf;
275+
for(my $i = 0; $i < @mixed; $i++) {
276+
if ($mixed[$i]->{id} eq 'query') {
277+
($next_orf) = grep {$_->{order} == $previous_orf->{order} + 1} @orfs;
278+
push @output, {
279+
query => $mixed[$i],
280+
orf_before => $previous_orf,
281+
orf_after => $next_orf
282+
};
283+
} else {
284+
$previous_orf = $mixed[$i];
285+
}
286+
}
287+
&_print_positions(\@output)
288+
}
289+
290+
sub _print_positions {
291+
my $ref = shift @_;
292+
my @out = @$ref;
293+
foreach (sort {$a->{query}->{order} <=> $b->{query}->{order}} @out ) {
294+
my $query = $_->{query};
295+
my $before = $_->{orf_before};
296+
my $after = $_->{orf_after};
297+
298+
my ($tag_before, $tag_after);
299+
my $pos = $query->{start};
300+
my $ref_data = $query->{data};
301+
302+
if ($pos <= $before->{end}) {
303+
$tag_before = $before->{id};
304+
$tag_after = $before->{id};
305+
} elsif ($pos > $before->{end} && $pos < $after->{start}) {
306+
$tag_before = $before->{id};
307+
$tag_after = $after->{id};
308+
} else {
309+
$tag_before = $after->{id};
310+
$tag_after = $after->{id};
311+
}
312+
print join "\t", ($pos, $tag_before, $tag_after, @$ref_data), "\n";
313+
}
314+
}
315+
316+
sub gff_pos {
317+
my $pos = $opts{"gff-pos"};
318+
my $ref_orfs = _gb2orf();
319+
my @orfs = @$ref_orfs;
320+
321+
push @orfs, {
322+
id => 'query',
323+
start => $pos,
324+
end => $pos,
243325
strand => undef
244326
};
245327

246328
@orfs = sort{$a->{start} <=> $b->{start}} @orfs;
247-
# print Dumper(\@orfs); exit;
248329
my $order_query = 0;
249330
for(my $i = 0; $i < @orfs; $i++) {
250331
$order_query = $i if $orfs[$i]->{id} eq 'query';

0 commit comments

Comments
 (0)