improve speed for masking gappy columns

y3tseng · y3tseng · commit 36f72b716b52 · 2025-12-17T13:20:06.000-08:00
diff --git a/install/installIterative.sh b/install/installIterative.sh
@@ -7,9 +7,8 @@ conda config --add channels bioconda
 conda config --add channels conda-forge
 conda config --set channel_priority strict
 
-conda install snakemake -y
-conda install ete3 -y
-conda install numpy -y
+conda install -y snakemake ete3 numpy numba
+
 
 # Get system architecture
 ARCH=$(uname -m)
diff --git a/workflow/config.yaml b/workflow/config.yaml
@@ -34,11 +34,11 @@ gapextend: -5
 matrix: ""
 
 # Iterative Mode
-mask_gappy: 0.995                         # Minimum proportion of gappy sites that would be mask before proceed to tree inference step
+mask_gappy: 0.95                         # Minimum proportion of gaps for a site to be masked before proceeding to the tree inference step
 
 
 # IQ-TREE Model Selection
 # Automatically determine the best model without specifying one, though this may take some time.
 # If you would like to speed up the process, please specify a model (e.g., "-m GTR"). 
 # See "https://iqtree.github.io/doc/Substitution-Models" for more details on available models.
-iqtree_model: ""
+iqtree_model: ""
diff --git a/workflow/rules/fasttree.smk b/workflow/rules/fasttree.smk
@@ -11,7 +11,7 @@ rule fasttree:
     threads: config["num_threads"]
     shell:
         '''
-        python3 scripts/reduceLen.py {input.msa} {params.tempFile} {params.threshold}
+        python3 scripts/reduceLen.py --threads {threads} {input.msa} {params.tempFile} {params.threshold}
         export OMP_NUM_THREADS={threads}
         {params.fasttree_exe} {params.model} -fastest {params.tempFile} > {params.tempTree} 
         python3 scripts/resolveTree.py {params.tempTree} {output.tree}
diff --git a/workflow/rules/iqtree.smk b/workflow/rules/iqtree.smk
@@ -11,7 +11,7 @@ rule iqtree:
     threads: config["num_threads"]
     shell:
         '''
-        python3 scripts/reduceLen.py {input.msa} {params.tempFile} {params.threshold}
+        python3 scripts/reduceLen.py --threads {threads} {input.msa} {params.tempFile} {params.threshold}
         {params.iqtree_exe} -s {params.tempFile} {params.model} --threads-max {threads}
         mv {params.temp}/msa.mask.fa.treefile {output}
         rm {params.temp}/msa.mask.fa.*
diff --git a/workflow/rules/raxml.smk b/workflow/rules/raxml.smk
@@ -10,7 +10,7 @@ rule raxml:
     threads: config["num_threads"]
     shell:
         '''
-        python3 scripts/reduceLen.py {input.msa} {params.tempFile} {params.threshold}
+        python3 scripts/reduceLen.py --threads {threads} {input.msa} {params.tempFile} {params.threshold}
         {params.raxml_exe} -s {params.tempFile} -m {params.model} -n raxml.tree -T {threads} -p 235813
         mv RAxML_bestTree.raxml.tree {output}
         rm *.raxml.tree
diff --git a/workflow/scripts/reduceLen.py b/workflow/scripts/reduceLen.py
@@ -1,50 +1,127 @@
-from argparse import ArgumentParser
-import numpy as np
+import argparse
 import time
+import sys
+import gzip
+import numpy as np
+from numba import jit, prange, set_num_threads
+
+# --- Helper: Efficient FASTA Reader (Supports .gz) ---
+def read_fasta_to_numpy(filename):
+    """
+    Read a FASTA (or gzip-compressed .gz FASTA) file and return (headers, seq_matrix): the '>' header lines and a 2D NumPy 'S1' array with one row per sequence; raises ValueError if no sequences are found or the array cannot be built.
+    """
+    headers = []
+    sequences = []
+    current_seq = []
+    
+    # Choose the opener from the file name: gzip for names ending in '.gz', the built-in open() otherwise
+    if filename.endswith('.gz'):
+        # 'rt' mode yields decoded text, so the parsing loop below is identical for both kinds of input
+        f = gzip.open(filename, 'rt')
+    else:
+        f = open(filename, 'r')
+
+    try:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            if line.startswith('>'):
+                if current_seq:
+                    sequences.append("".join(current_seq))
+                    current_seq = []
+                headers.append(line)
+            else:
+                current_seq.append(line)
+        # Add the last sequence. NOTE(review): a record with no sequence lines adds a header but no sequence (here and in the loop above), so headers and sequences can fall out of step
+        if current_seq:
+            sequences.append("".join(current_seq))
+    finally:
+        f.close()
+            
+    if not sequences:
+        raise ValueError("No sequences found in input file.")
+
+    # Build the matrix with one row per sequence and one character per cell (S1 = 1-byte string)
+    try:
+        # Any ValueError raised while building the 'S1' array is re-raised below as an unequal-length error
+        seq_matrix = np.array([list(s) for s in sequences], dtype='S1')
+    except ValueError:
+        raise ValueError("Sequences must all be the same length for this operation.")
+        
+    return headers, seq_matrix
+
+# --- Core Logic: Parallel Filter (Numba) ---
+# nopython=True compiles to machine code (no Python interpreter slow-down); parallel=True enables automatic parallelization of the prange loop (OpenMP/TBB backend)
+# Returns a boolean array with one entry per column: True (keep) when the column's '-' count is <= floor(threshold * number of sequences)
+@jit(nopython=True, parallel=True)
+def get_column_mask(seq_matrix, threshold):
+    n_seqs, n_cols = seq_matrix.shape
+    max_allowed_gaps = int(np.floor(threshold * n_seqs))
+    
+    keep_mask = np.zeros(n_cols, dtype=np.bool_)
+    # Each prange iteration counts gaps in one column and writes only keep_mask[col_idx]
+    for col_idx in prange(n_cols):
+        gap_count = 0
+        for seq_idx in range(n_seqs):
+            # Gap test: b'-' is the byte representation of a dash
+            if seq_matrix[seq_idx, col_idx] == b'-':
+                gap_count += 1
+        
+        if gap_count <= max_allowed_gaps:
+            keep_mask[col_idx] = True
+            
+    return keep_mask
+
+# --- Main Execution: parse CLI arguments, drop gappy columns from the alignment, write the result as FASTA ---
+def main():
+    parser = argparse.ArgumentParser(description='Reduce alignment length to speedup tree inference process')
+    parser.add_argument('inaln', help='Input alignment (FASTA or .gz)')
+    parser.add_argument('outaln', help='Output alignment (Uncompressed FASTA)')
+    parser.add_argument('threshold', type=float, help='Minimum gap proportion for a column be removed')
+    parser.add_argument('--threads', type=int, default=1, help='Number of threads to use')
+    args = parser.parse_args()
+
+    # 1. Configure Threads (set_num_threads is only called when --threads is greater than 1)
+    if args.threads > 1:
+        set_num_threads(args.threads)
+        print(f"Using {args.threads} threads.")
+
+    try:
+        print(f"Reading alignment from {args.inaln}...")
+        headers, seq_matrix = read_fasta_to_numpy(args.inaln)
+        
+        num_seqs, num_cols = seq_matrix.shape
+        print(f"Original dimensions: {num_seqs} sequences, {num_cols} columns")
+
+        # Start Timing (covers the mask computation and the slicing only; reading and writing are outside the timed region)
+        start_time = time.perf_counter()
+
+        # 2. Parallel Analysis: boolean mask, True for the columns to keep
+        keep_mask = get_column_mask(seq_matrix, args.threshold)
+
+        # 3. Filtering (Slicing): boolean indexing keeps only the columns where keep_mask is True
+        filtered_matrix = seq_matrix[:, keep_mask]
+
+        end_time = time.perf_counter()
+        elapsed_ms = (end_time - start_time) * 1000
+
+        new_cols = filtered_matrix.shape[1]
+        print(f"Original length: {num_cols}, length after removing gappy columns: {new_cols}")
+        print(f"Remove gappy columns in {elapsed_ms:.2f} ms")
+
+        # 4. Write Output (Uncompressed): each header line followed by its whole filtered sequence on a single line
+        print(f"Writing output to {args.outaln}...")
+        with open(args.outaln, 'w') as f:
+            for i, header in enumerate(headers):
+                seq_str = filtered_matrix[i].tobytes().decode('utf-8')
+                f.write(f"{header}\n{seq_str}\n")
+        
+        print("Done.")
+    # Top-level boundary: any exception from the steps above is reported on stderr and the script exits with status 1
+    except Exception as e:
+        sys.stderr.write(f"Error: {e}\n")
+        sys.exit(1)
 
-parser = ArgumentParser(description='Reduce alignment length to speedup tree inference process')
-parser.add_argument('inaln', help='Input alignment')
-parser.add_argument('outaln', help='Output alignment')
-parser.add_argument('threshold', type=float, help='Minimum gap porpotion for a column be removed')
-args = parser.parse_args()
-
-st = time.time()
-
-threshold = args.threshold
-name = []
-aln = []
-
-with open(args.inaln, "r") as alnFile:
-    inContent = alnFile.read().splitlines()
-    for c in inContent:
-        if c[0] == '>':
-            name.append(c)
-        else:
-            aln.append(c)
-
-allAln = np.array([list(a) for a in aln])
-lb = len(allAln[0])
-allAln = np.transpose(allAln)
-stayedRows = []
-rowID = 0
-for row in allAln:
-    num_gap = (row == '-').sum()
-    if num_gap/len(name) <= threshold:
-        stayedRows.append(rowID)
-    rowID += 1
-newAln = []
-for r in stayedRows:
-    newAln.append(allAln[r])
-newAln = np.array(newAln)
-newAln = np.transpose(newAln)
-la = len(newAln[0])
-outFile = []
-with open(args.outaln, "w") as outFile:
-    n = 0
-    for a in newAln:
-        outFile.write(name[n]+'\n')
-        n += 1
-        outFile.write("".join(a)+'\n')
-en = time.time()
-
-print("Masked gappy site. Length before/after: "+str(lb)+"/"+str(la)+". Total time: ", en-st, "seconds.")
+if __name__ == "__main__":  # run main() only when executed as a script, not when imported
+    main()