bioaln: add orf-slice

wqiu · wqiu · commit e68b7153e493 · 2025-07-31T05:58:28.000-04:00
diff --git a/bin/bioaln b/bin/bioaln
@@ -59,6 +59,7 @@ GetOptions(
     "select-third",
     "shuffle-sites|S",
     "slice|s=s",
+    "slice-orfs=s",
     "split-cdhit=s",
     "trim-ends",
     "uniq|u",
@@ -416,7 +417,7 @@ Generate an alignment of every-third (mostly synonymous) bases (assuming a CDS a
 
 Make a shuffled (not bootstrapped, which is sampling I<with> replacement) alignment. This operation I<permutes> alignment columns. It is used for testing the significance of long-runs of conserved sites in an alignment (e.g., conserved intergenic spacer sequences).
 
-=item --slice, -s 'start|-,end|-'; or -s"file:orfs.tsv"
+=item --slice, -s 'start|-,end|-'
 
 Get a slice of the alignment.
 
@@ -425,10 +426,15 @@ Using a '-' character in the first or second position defaults to the beginning
  --slice '20,80' or --slice '20,80' or -s='20,80' or --slice='20,80': Slice from position 20 to 80, inclusive.
  --slice '-,80':  Slice from beginning up to, and including position 80
  --slice '20,-':  Slice from position 20 up to, and including, the end of the alignment
- --slice 'file:orfs.tsv': slice using coordinates from a file. The file should contain 4 tab/space-delimited columns: locus_tag (no space), start (numerical), end (numerical), strand(0/1)
 
 NOTE: --slice'-,x' (where x is '-' or a position) without a space does NOT work. Use --slice='-,x' (or a space in place of =) instead.
 
+=item --slice-orfs orfs.tsv
+
+Get slices of the alignment based on an input interval file. The file should contain 4 tab/space-delimited columns: locus_tag (no space), start (numerical), end (numerical), and strand (negative strand marked as 0, -1, or "-").
+
+Each line of the interval file generates one alignment file, written in FASTA format and named after the locus tag (locus_tag.aln). This method is designed with a gff file in mind, where each line defines an ORF location. The input alignment is expected to be an in-frame whole-genome alignment originating from a VCF file made with a reference genome. This method would (ideally) split the whole-genome alignment into ORF-by-ORF in-frame alignments.
+
 =item --split-cdhit <cdhit clrs file>
 
 Generate alignment for each CDHIT family (based on .clrs file). Ignore if you don't use cdhit for family clustering.
diff --git a/lib/Bio/BPWrapper/AlnManipulations.pm b/lib/Bio/BPWrapper/AlnManipulations.pm
@@ -45,7 +45,7 @@ avg_id_by_win concat conserve_blocks get_consensus dns_to_protein
 remove_gapped_cols_in_one_seq colnum_from_residue_pos
 list_ids premute_states protein_to_dna sample_seqs
 shuffle_sites random_slice select_third_sites remove_third_sites
-upper_case );
+upper_case slice_orfs);
 
 use Bio::BPWrapper;
 # Package global variables
@@ -70,6 +70,7 @@ my %opt_dispatch = (
     "pick" => \&pick_seqs,
     "ref-seq" => \&change_ref,
     "slice" => \&aln_slice,
+    "slice-orfs" => \&orf_slice,
     "split-cdhit" => \&split_cdhit,
     "uniq" => \&get_unique,
     "var-sites" => \&variable_sites,
@@ -807,39 +808,56 @@ with improvements.
 
 =cut
 
-
 sub aln_slice {    # get alignment slice
     my $opt_str = $opts{"slice"};
-    my $id;
-    my $begin;
-    my $end;
-    my $strand;
-    if ($opt_str =~ /^file:(\S+)$/) {
-	my $fname = $1;
-	open COORD, "<", $fname;
-	while(<COORD>) {
-	    chomp;
-	    ($id, $begin, $end, $strand) = split;
-	}
-    } else {
-	($begin, $end) = split(/\s*,\s*/, $opt_str);
-    }
+    my ($begin, $end) = split(/\s*,\s*/, $opt_str);    # option value is "begin,end"; either side may be "-"
     
     # Allow for one parameter to be omitted. Default $begin to the
     # beginning of the alignment, and $end to the end.
     $begin = 1            if $begin eq "-";
     $end   = $aln->length if $end   eq "-";
     $aln = $aln->slice($begin, $end);
-    if ($strand && $strand == 0) {
+}
+
+sub orf_slice {    # write one FASTA alignment file (<locus_tag>.aln) per ORF interval, then exit
+    my $opt_str = $opts{"slice-orfs"};
+    my @orf_coords;
+    my ($fname) = $opt_str =~ /^(\S+)$/;    # interval file name must not contain whitespace
+    defined $fname or die "--slice-orfs: invalid interval file name '$opt_str'\n";
+    open my $coord_fh, "<", $fname or die "--slice-orfs: cannot open $fname: $!\n";
+    while(<$coord_fh>) {
+	next if /^\s*#/ || !/\S/;    # skip comment and blank lines; split below already drops the newline
+	my ($id, $begin, $end, $strand) = split;
+	push @orf_coords, {
+	    id => $id,
+	    begin => $begin,
+	    end => $end,
+	    strand => $strand
+	}
+    }
+
+    foreach my $orf_rec (@orf_coords) {
 	my $new_aln = Bio::SimpleAlign -> new();
-	foreach ($aln->each_seq) {
-	    my $revcom = $_ -> revcom();
-	    my $end = $_ -> end;
-	    $revcom->end($end);
-	    $new_aln->add_seq($revcom);
+	my $orf = $orf_rec->{id};
+	my $begin = $orf_rec->{begin};
+	my $end = $orf_rec->{end};
+	my $strand = $orf_rec->{strand};
+	my $out_fname = $orf . ".aln";
+	my $slice = $aln->slice($begin, $end);
+	if (defined $strand && $strand =~ /^(?:-|0|-1)$/) { # negative strand is 0, -1, or -; "+", 1, or a missing column stay forward
+	    my $new_slice = Bio::SimpleAlign -> new();
+	    foreach my $seq ($slice->each_seq) {
+		my $revcom = $seq->revcom();
+		my $seq_end = $seq->end;    # keep the original end coordinate on the reverse-complemented sequence
+		$revcom->end($seq_end);
+		$new_slice->add_seq($revcom);
+	    }
+	    $slice = $new_slice;
 	}
-	$aln = $new_aln;
+	my $out = Bio::AlignIO -> new(-file => ">$out_fname", -format => "fasta");
+	$out->write_aln($slice);
     }
+    exit;
 }
 
 =head2 get_unique()