Skip to content

Extract exome #1

@mr-eyes

Description

@mr-eyes
#!/bin/bash

# ==============================================================================
# Script Name: extract_exome.sh
# Description: Extracts exonic regions (exome) from a genomic FASTA and GFF file.
#              Ensures that all sequence IDs in the GFF match those in the FASTA.
# Usage: ./extract_exome.sh genome.fna(.gz) genome.gff(.gz) [output_exome.fa]
# Dependencies: bedtools, samtools, awk, zcat, gunzip, sort, comm
# ==============================================================================

# Exit immediately if a command exits with a non-zero status
set -e

# Function to display usage instructions
usage() {
    echo "Usage: $0 <genome.fna(.gz)> <genome.gff(.gz)> [output_exome.fa]"
    echo
    echo "Arguments:"
    echo "  genome.fna(.gz)   Genomic FASTA file (.fna or .fna.gz)"
    echo "  genome.gff(.gz)   GFF annotation file (.gff or .gff.gz)"
    echo "  output_exome.fa  (Optional) Output FASTA file for exome sequences (default: exome.fa)"
    exit 1
}

# Check if at least two arguments are provided
if [ "$#" -lt 2 ]; then
    echo "Error: Insufficient arguments provided."
    usage
fi

# Assign input arguments to variables
GENOME_FNA_INPUT="$1"
GENOME_GFF_INPUT="$2"

# Assign output file name
OUTPUT_FA="${3:-exome.fa}"

# Function to check if a file is gzipped
is_gzipped() {
    local file="$1"
    if [[ "$file" == *.gz ]]; then
        return 0  # True: gzipped
    else
        return 1  # False: not gzipped
    fi
}

# Function to decompress a file if it's gzipped
decompress_if_needed() {
    local file="$1"
    local tmp_dir="$2"
    local output_var="$3"

    if is_gzipped "$file"; then
        local decompressed_file="$tmp_dir/$(basename "${file%.*}")"
        echo "Decompressing '$file' to '$decompressed_file'..."
        gunzip -c "$file" > "$decompressed_file"
        eval "$output_var='$decompressed_file'"
    else
        echo "Using uncompressed file '$file'..."
        eval "$output_var='$file'"
    fi
}

# Check if input files exist
if [ ! -f "$GENOME_FNA_INPUT" ]; then
    echo "Error: File '$GENOME_FNA_INPUT' not found."
    exit 1
fi

if [ ! -f "$GENOME_GFF_INPUT" ]; then
    echo "Error: File '$GENOME_GFF_INPUT' not found."
    exit 1
fi

# Check if required tools are installed
for cmd in bedtools samtools awk sort comm; do
    if ! command -v "$cmd" &> /dev/null; then
        echo "Error: '$cmd' is not installed. Please install it and try again."
        exit 1
    fi
done

# Determine which decompression tool to use
if command -v zcat &> /dev/null; then
    DECOMPRESS_CMD="zcat"
elif command -v gunzip &> /dev/null; then
    DECOMPRESS_CMD="gunzip -c"
else
    echo "Error: Neither 'zcat' nor 'gunzip' is available for decompression."
    exit 1
fi

# Create a temporary directory for intermediate files
TMP_DIR=$(mktemp -d)
trap "rm -rf $TMP_DIR" EXIT

echo "Temporary directory created at $TMP_DIR"

# Define intermediate file paths
EXONS_BED="$TMP_DIR/exons.bed"
DECOMPRESSED_FA="$TMP_DIR/genome.fna"
DECOMPRESSED_GFF="$TMP_DIR/genome.gff"

# ==============================================================================
# Step 1: Prepare the Genomic FASTA File
# ==============================================================================

echo "Preparing the genomic FASTA file..."

decompress_if_needed "$GENOME_FNA_INPUT" "$TMP_DIR" DECOMPRESSED_FA

# ==============================================================================
# Step 2: Prepare the GFF File
# ==============================================================================

echo "Preparing the GFF file..."

decompress_if_needed "$GENOME_GFF_INPUT" "$TMP_DIR" DECOMPRESSED_GFF

# ==============================================================================
# Step 3: Extract and Compare Sequence IDs
# ==============================================================================

echo "Extracting sequence IDs from FASTA and GFF files..."

# Extract sequence IDs from FASTA
grep '^>' "$DECOMPRESSED_FA" | awk '{print substr($1,2)}' | sort > "$TMP_DIR/fasta_ids.txt"

# Extract sequence IDs from GFF, excluding comment lines
awk '!/^#/ {print $1}' "$DECOMPRESSED_GFF" | sort | uniq > "$TMP_DIR/gff_ids.txt"

echo "Comparing sequence IDs between FASTA and GFF..."

# Find IDs in GFF not in FASTA
comm -23 "$TMP_DIR/gff_ids.txt" "$TMP_DIR/fasta_ids.txt" > "$TMP_DIR/ids_only_in_gff.txt"

# Find IDs in FASTA not in GFF
comm -13 "$TMP_DIR/gff_ids.txt" "$TMP_DIR/fasta_ids.txt" > "$TMP_DIR/ids_only_in_fasta.txt"

# Check for discrepancies
IDS_ONLY_IN_GFF=$(wc -l < "$TMP_DIR/ids_only_in_gff.txt")
IDS_ONLY_IN_FASTA=$(wc -l < "$TMP_DIR/ids_only_in_fasta.txt")

if [ "$IDS_ONLY_IN_GFF" -ne 0 ] || [ "$IDS_ONLY_IN_FASTA" -ne 0 ]; then
    echo "Error: Sequence ID mismatches detected between GFF and FASTA files."

    if [ "$IDS_ONLY_IN_GFF" -ne 0 ]; then
        echo "Sequence IDs present in GFF but NOT in FASTA ($IDS_ONLY_IN_GFF):"
        cat "$TMP_DIR/ids_only_in_gff.txt"
    fi

    if [ "$IDS_ONLY_IN_FASTA" -ne 0 ]; then
        echo "Sequence IDs present in FASTA but NOT in GFF ($IDS_ONLY_IN_FASTA):"
        cat "$TMP_DIR/ids_only_in_fasta.txt"
    fi

    echo "Please ensure that the sequence IDs in both files match exactly."
    exit 1
else
    echo "All sequence IDs in GFF match those in FASTA."
fi

# ==============================================================================
# Step 4: Index the Genomic FASTA File
# ==============================================================================

echo "Indexing the genomic FASTA file using samtools faidx..."

samtools faidx "$DECOMPRESSED_FA"

echo "FASTA indexing completed."

# ==============================================================================
# Step 5: Extract Exon Coordinates and Convert to BED Format
# ==============================================================================

echo "Extracting exon coordinates from GFF file and converting to BED format..."

awk '$3 == "exon" {print $1"\t"($4-1)"\t"$5"\t"$9"\t.\t"$7}' "$DECOMPRESSED_GFF" > "$EXONS_BED"

echo "Exon coordinates saved to $EXONS_BED"

# ==============================================================================
# Step 6: Extract Exonic Sequences using bedtools
# ==============================================================================

echo "Extracting exonic sequences using bedtools..."

bedtools getfasta \
    -fi "$DECOMPRESSED_FA" \
    -bed "$EXONS_BED" \
    -fo "$OUTPUT_FA" \
    -name  # Optional: Use BED name field as FASTA headers

echo "Exonic sequences saved to $OUTPUT_FA"

# ==============================================================================
# Completion Message
# ==============================================================================

echo "Exome extraction completed successfully."
echo "Output FASTA file: $OUTPUT_FA"

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions