Skip to content

Commit e68b715

Browse files
author
wqiu
committed
bioaln: add orf-slice
1 parent 7a05f06 commit e68b715

File tree

2 files changed

+49
-25
lines changed

2 files changed

+49
-25
lines changed

bin/bioaln

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ GetOptions(
5959
"select-third",
6060
"shuffle-sites|S",
6161
"slice|s=s",
62+
"slice-orfs=s",
6263
"split-cdhit=s",
6364
"trim-ends",
6465
"uniq|u",
@@ -416,7 +417,7 @@ Generate an alignment of every-third (mostly synonymous) bases (assuming a CDS a
416417

417418
Make a shuffled (not bootstrapped, which is sampling I<with> replacement) alignment. This operation I<permutes> alignment columns. It is used for testing the significance of long-runs of conserved sites in an alignment (e.g., conserved intergenic spacer sequences).
418419

419-
=item --slice, -s 'start|-,end|-'; or -s"file:orfs.tsv"
420+
=item --slice, -s 'start|-,end|-'
420421

421422
Get a slice of the alignment.
422423

@@ -425,10 +426,15 @@ Using a '-' character in the first or second position defaults to the beginning
425426
--slice '20,80' or --slice '20,80' or -s='20,80' or --slice='20,80': Slice from position 20 to 80, inclusive.
426427
--slice '-,80': Slice from beginning up to, and including position 80
427428
--slice '20,-': Slice from position 20 up to, and including, the end of the alignment
428-
--slice 'file:orfs.tsv': slice using coordinates from a file. The file should contain 4 tab/space-delimited columns: locus_tag (no space), start (numerical), end (numerical), strand(0/1)
429429

430430
NOTE: --slice'-,x' (where x is '-' or a position) without a space does NOT work. Use --slice='-,x' (or a space in place of =) instead.
431431

432+
=item --slice-orfs orfs.tsv
433+
434+
Get slices of the alignment based on an input interval file. The file should contain 4 tab/space-delimited columns: locus_tag (no space), start (numerical), end (numerical), strand(negative strand marked as 0, -1, or "-").
435+
436+
Each line of the interval file would generate a single alignment file. This method is designed with a gff file in mind, each line defines an ORF location. The input file is an in-frame whole-genome alignment originating from a VCF file made with a reference genome. This method would (ideally) split the whole-genome alignment into ORF-by-ORF in-frame alignments.
437+
432438
=item --split-cdhit <cdhit clrs file>
433439

434440
Generate alignment for each CDHIT family (based on .clrs file). Ignore if you don't use cdhit for family clustering.

lib/Bio/BPWrapper/AlnManipulations.pm

Lines changed: 41 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ avg_id_by_win concat conserve_blocks get_consensus dns_to_protein
4545
remove_gapped_cols_in_one_seq colnum_from_residue_pos
4646
list_ids premute_states protein_to_dna sample_seqs
4747
shuffle_sites random_slice select_third_sites remove_third_sites
48-
upper_case );
48+
upper_case slice_orfs);
4949

5050
use Bio::BPWrapper;
5151
# Package global variables
@@ -70,6 +70,7 @@ my %opt_dispatch = (
7070
"pick" => \&pick_seqs,
7171
"ref-seq" => \&change_ref,
7272
"slice" => \&aln_slice,
73+
"slice-orfs" => \&orf_slice,
7374
"split-cdhit" => \&split_cdhit,
7475
"uniq" => \&get_unique,
7576
"var-sites" => \&variable_sites,
@@ -807,39 +808,56 @@ with improvements.
807808
808809
=cut
809810

810-
811811
sub aln_slice { # get alignment slice
812812
my $opt_str = $opts{"slice"};
813-
my $id;
814-
my $begin;
815-
my $end;
816-
my $strand;
817-
if ($opt_str =~ /^file:(\S+)$/) {
818-
my $fname = $1;
819-
open COORD, "<", $fname;
820-
while(<COORD>) {
821-
chomp;
822-
($id, $begin, $end, $strand) = split;
823-
}
824-
} else {
825-
($begin, $end) = split(/\s*,\s*/, $opt_str);
826-
}
813+
my ($begin, $end) = split(/\s*,\s*/, $opt_str);
827814

828815
# Allow for one parameter to be omitted. Default $begin to the
829816
# beginning of the alignment, and $end to the end.
830817
$begin = 1 if $begin eq "-";
831818
$end = $aln->length if $end eq "-";
832819
$aln = $aln->slice($begin, $end);
833-
if ($strand && $strand == 0) {
820+
}
821+
822+
sub orf_slice { # get alignment slice
823+
my $opt_str = $opts{"slice-orfs"};
824+
my @orf_coords;
825+
$opt_str =~ /^(\S+)$/;
826+
my $fname = $1;
827+
open COORD, "<", $fname;
828+
while(<COORD>) {
829+
chomp;
830+
my ($id, $begin, $end, $strand) = split;
831+
push @orf_coords, {
832+
id => $id,
833+
begin => $begin,
834+
end => $end,
835+
strand => $strand
836+
}
837+
}
838+
839+
foreach (@orf_coords) {
834840
my $new_aln = Bio::SimpleAlign -> new();
835-
foreach ($aln->each_seq) {
836-
my $revcom = $_ -> revcom();
837-
my $end = $_ -> end;
838-
$revcom->end($end);
839-
$new_aln->add_seq($revcom);
841+
my $orf = $_->{id};
842+
my $begin = $_ -> {begin};
843+
my $end = $_ -> {end};
844+
my $strand = $_ -> {strand};
845+
my $fname = $orf . ".aln";
846+
my $slice = $aln->slice($begin, $end);
847+
if ($strand < 1 || $strand eq '-') { # could be 0, -1, or -
848+
my $new_slice = Bio::SimpleAlign -> new();
849+
foreach ($slice->each_seq) {
850+
my $revcom = $_ -> revcom();
851+
my $end = $_ -> end;
852+
$revcom->end($end);
853+
$new_slice->add_seq($revcom);
854+
}
855+
$slice = $new_slice;
840856
}
841-
$aln = $new_aln;
857+
my $out = Bio::AlignIO -> new(-file => ">$fname", -format => "fasta");
858+
$out->write_aln($slice);
842859
}
860+
exit;
843861
}
844862

845863
=head2 get_unique()

0 commit comments

Comments
 (0)