Skip to content

Commit 1dec648

Browse files
author
wqiu
committed
bioseq: add --gff-pos
1 parent ab535c4 commit 1dec648

File tree

2 files changed

+33
-2
lines changed

2 files changed

+33
-2
lines changed

bin/bioseq

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ GetOptions(
3232
# "extract|e",
3333
# "dotplot|D=s",
3434
"hydroB|H",
35+
"gff-pos=i",
3536
"iep",
3637
"input|i=s",
3738
"lead-gaps|G",
@@ -217,6 +218,14 @@ Extract gene sequences in FASTA from a GenBank file of bacterial genome. Won't w
217218
Retrieves a sequence from GenBank using the provided accession number, e.g.,
218219
bioseq -f 'NC_003078' -o 'genbank'
219220

221+
=item --gff-pos <pos> <genbank_file>
222+
223+
Given a genbank file, find the genome location of a given position.
224+
225+
The coord file is a 4-tab delimited file where each row is a gene location, consisting of locus_tag, begin, end, and strand.
226+
227+
The output is the form of locus_tag1:locus_tag2, where the two tags are the same if the position falls on a coding region and different otherwise.
228+
220229
=item --hydroB, -H
221230

222231
Return the mean Kyte-Doolittle hydropathicity for protein sequences.

lib/Bio/BPWrapper/SeqManipulations.pm

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ pick_by_id del_by_id find_by_re
5151
pick_by_re del_by_re
5252
pick_by_file del_by_file
5353
find_by_ambig pick_by_ambig del_by_ambig find_by_length
54-
del_by_length codon_sim codon_info trim_ends);
54+
del_by_length codon_sim codon_info trim_ends gff_pos);
5555

5656
# Package global variables
5757
my ($in, $out, $seq, %opts, $filename, $in_format, $out_format, $guesser);
@@ -69,6 +69,7 @@ my %opt_dispatch = (
6969
'mol-wt' => \&print_weight,
7070
'delete' => \&filter_seqs,
7171
'fetch' => \&retrieve_seqs,
72+
'gff-pos' => \&gff_pos,
7273
'no-gaps' => \&remove_gaps,
7374
'length' => \&print_lengths,
7475
'longest-orf' => \&update_longest_orf,
@@ -157,7 +158,7 @@ sub initialize {
157158
# }
158159
# $in_format = $guesser->guess() unless $opts{'input'};
159160

160-
$in_format = $opts{"input"} // 'fasta';
161+
$in_format = $opts{'gff-pos'} ? 'genbank' : $opts{"input"} ? $opts{"input"} : 'fasta';
161162

162163
# die "Reads only fasta, fastq, embl, genbank. Not aligment file formats like clustalw\n" unless $in_format =~ /fasta|fastq|embl|genbank/;
163164
$in = Bio::SeqIO->new(-format => $in_format, ($filename eq "STDIN")? (-fh => \*STDIN) : (-file => $filename));
@@ -193,6 +194,27 @@ sub write_out {
193194

194195
################### subroutines ########################
195196

197+
sub gff_pos {
198+
my $pos = $opts{"gff-pos"};
199+
die "$filename: Not a GenBank file. Quit\n" unless $in_format eq 'genbank';
200+
$seq = $in->next_seq();
201+
my $gene_count = 0;
202+
foreach my $feat ($seq->get_SeqFeatures()) {
203+
if ($feat->primary_tag eq 'CDS') {
204+
my $location = $feat->location();
205+
next if $location->isa('Bio::Location::Split');
206+
my $gene_tag = "gene_" . $gene_count++;
207+
my $gene_symbol = 'na';
208+
my $product = 'na';
209+
foreach my $tag ($feat->get_all_tags()) {
210+
($gene_tag) = $feat->get_tag_values($tag) if $tag eq 'locus_tag';
211+
}
212+
print $gene_tag, "\n";
213+
}
214+
}
215+
}
216+
217+
196218
# from vapid trim python code
197219
sub trim_ends {
198220
while ($seq = $in->next_seq()) {

0 commit comments

Comments
 (0)