-
Notifications
You must be signed in to change notification settings - Fork 0
Open
Description
#!/bin/bash
# ==============================================================================
# Script Name: extract_exome.sh
# Description: Extracts exonic regions (exome) from a genomic FASTA and GFF file.
# Ensures that all sequence IDs in the GFF match those in the FASTA.
# Usage: ./extract_exome.sh genome.fna(.gz) genome.gff(.gz) [output_exome.fa]
# Dependencies: bedtools, samtools, awk, zcat, gunzip, sort, comm
# ==============================================================================
# Exit immediately if a command exits with a non-zero status
set -e
# Function to display usage instructions
usage() {
echo "Usage: $0 <genome.fna(.gz)> <genome.gff(.gz)> [output_exome.fa]"
echo
echo "Arguments:"
echo " genome.fna(.gz) Genomic FASTA file (.fna or .fna.gz)"
echo " genome.gff(.gz) GFF annotation file (.gff or .gff.gz)"
echo " output_exome.fa (Optional) Output FASTA file for exome sequences (default: exome.fa)"
exit 1
}
# Check if at least two arguments are provided
if [ "$#" -lt 2 ]; then
echo "Error: Insufficient arguments provided."
usage
fi
# Assign input arguments to variables
GENOME_FNA_INPUT="$1"
GENOME_GFF_INPUT="$2"
# Assign output file name
OUTPUT_FA="${3:-exome.fa}"
# Function to check if a file is gzipped
is_gzipped() {
local file="$1"
if [[ "$file" == *.gz ]]; then
return 0 # True: gzipped
else
return 1 # False: not gzipped
fi
}
# Function to decompress a file if it's gzipped
decompress_if_needed() {
local file="$1"
local tmp_dir="$2"
local output_var="$3"
if is_gzipped "$file"; then
local decompressed_file="$tmp_dir/$(basename "${file%.*}")"
echo "Decompressing '$file' to '$decompressed_file'..."
gunzip -c "$file" > "$decompressed_file"
eval "$output_var='$decompressed_file'"
else
echo "Using uncompressed file '$file'..."
eval "$output_var='$file'"
fi
}
# Check if input files exist
if [ ! -f "$GENOME_FNA_INPUT" ]; then
echo "Error: File '$GENOME_FNA_INPUT' not found."
exit 1
fi
if [ ! -f "$GENOME_GFF_INPUT" ]; then
echo "Error: File '$GENOME_GFF_INPUT' not found."
exit 1
fi
# Check if required tools are installed
for cmd in bedtools samtools awk sort comm; do
if ! command -v "$cmd" &> /dev/null; then
echo "Error: '$cmd' is not installed. Please install it and try again."
exit 1
fi
done
# Determine which decompression tool to use
if command -v zcat &> /dev/null; then
DECOMPRESS_CMD="zcat"
elif command -v gunzip &> /dev/null; then
DECOMPRESS_CMD="gunzip -c"
else
echo "Error: Neither 'zcat' nor 'gunzip' is available for decompression."
exit 1
fi
# Create a temporary directory for intermediate files
TMP_DIR=$(mktemp -d)
trap "rm -rf $TMP_DIR" EXIT
echo "Temporary directory created at $TMP_DIR"
# Define intermediate file paths
EXONS_BED="$TMP_DIR/exons.bed"
DECOMPRESSED_FA="$TMP_DIR/genome.fna"
DECOMPRESSED_GFF="$TMP_DIR/genome.gff"
# ==============================================================================
# Step 1: Prepare the Genomic FASTA File
# ==============================================================================
echo "Preparing the genomic FASTA file..."
decompress_if_needed "$GENOME_FNA_INPUT" "$TMP_DIR" DECOMPRESSED_FA
# ==============================================================================
# Step 2: Prepare the GFF File
# ==============================================================================
echo "Preparing the GFF file..."
decompress_if_needed "$GENOME_GFF_INPUT" "$TMP_DIR" DECOMPRESSED_GFF
# ==============================================================================
# Step 3: Extract and Compare Sequence IDs
# ==============================================================================
echo "Extracting sequence IDs from FASTA and GFF files..."
# Extract sequence IDs from FASTA
grep '^>' "$DECOMPRESSED_FA" | awk '{print substr($1,2)}' | sort > "$TMP_DIR/fasta_ids.txt"
# Extract sequence IDs from GFF, excluding comment lines
awk '!/^#/ {print $1}' "$DECOMPRESSED_GFF" | sort | uniq > "$TMP_DIR/gff_ids.txt"
echo "Comparing sequence IDs between FASTA and GFF..."
# Find IDs in GFF not in FASTA
comm -23 "$TMP_DIR/gff_ids.txt" "$TMP_DIR/fasta_ids.txt" > "$TMP_DIR/ids_only_in_gff.txt"
# Find IDs in FASTA not in GFF
comm -13 "$TMP_DIR/gff_ids.txt" "$TMP_DIR/fasta_ids.txt" > "$TMP_DIR/ids_only_in_fasta.txt"
# Check for discrepancies
IDS_ONLY_IN_GFF=$(wc -l < "$TMP_DIR/ids_only_in_gff.txt")
IDS_ONLY_IN_FASTA=$(wc -l < "$TMP_DIR/ids_only_in_fasta.txt")
if [ "$IDS_ONLY_IN_GFF" -ne 0 ] || [ "$IDS_ONLY_IN_FASTA" -ne 0 ]; then
echo "Error: Sequence ID mismatches detected between GFF and FASTA files."
if [ "$IDS_ONLY_IN_GFF" -ne 0 ]; then
echo "Sequence IDs present in GFF but NOT in FASTA ($IDS_ONLY_IN_GFF):"
cat "$TMP_DIR/ids_only_in_gff.txt"
fi
if [ "$IDS_ONLY_IN_FASTA" -ne 0 ]; then
echo "Sequence IDs present in FASTA but NOT in GFF ($IDS_ONLY_IN_FASTA):"
cat "$TMP_DIR/ids_only_in_fasta.txt"
fi
echo "Please ensure that the sequence IDs in both files match exactly."
exit 1
else
echo "All sequence IDs in GFF match those in FASTA."
fi
# ==============================================================================
# Step 4: Index the Genomic FASTA File
# ==============================================================================
echo "Indexing the genomic FASTA file using samtools faidx..."
samtools faidx "$DECOMPRESSED_FA"
echo "FASTA indexing completed."
# ==============================================================================
# Step 5: Extract Exon Coordinates and Convert to BED Format
# ==============================================================================
echo "Extracting exon coordinates from GFF file and converting to BED format..."
awk '$3 == "exon" {print $1"\t"($4-1)"\t"$5"\t"$9"\t.\t"$7}' "$DECOMPRESSED_GFF" > "$EXONS_BED"
echo "Exon coordinates saved to $EXONS_BED"
# ==============================================================================
# Step 6: Extract Exonic Sequences using bedtools
# ==============================================================================
echo "Extracting exonic sequences using bedtools..."
bedtools getfasta \
-fi "$DECOMPRESSED_FA" \
-bed "$EXONS_BED" \
-fo "$OUTPUT_FA" \
-name # Optional: Use BED name field as FASTA headers
echo "Exonic sequences saved to $OUTPUT_FA"
# ==============================================================================
# Completion Message
# ==============================================================================
echo "Exome extraction completed successfully."
echo "Output FASTA file: $OUTPUT_FA"Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels