This repository provides a reproducible, parallelized pipeline for quantifying transposable elements and gene expression from RNA-seq BAM files using TEtranscripts.
- Automated installation of dependencies (Conda, Python, R, TEtranscripts)
- Parallel processing of multiple BAM files using GNU parallel
- TE and gene expression quantification in a single output directory
- Bash shell
- wget, curl, parallel, unzip
- Internet access for downloading dependencies
## Clone the GitHub Repository
git clone https://github.com/Ali-mhm/TEtranscripts_Multi_thread.git
# git names the checkout after the repository (URL basename minus ".git"),
# so that is the directory to enter.
cd TEtranscripts_Multi_thread
## Make the Script Executable
chmod +x run_TEtranscripts_pipeline.sh
## (Optional) Edit Configuration
# Pipeline settings; edit these to match your machine before running.
ENV_NAME="te_env" # Conda environment name
THREADS=4 # Number of threads for parallel processing
BAM_DIR=~/testdata_PE # Directory containing your BAM files; searched for *.bam
OUTPUT_DIR=~/output1 # Output directory; passed to TEcount as --outdir
## Run the Pipeline
./run_TEtranscripts_pipeline.sh
# Install GNU parallel
# Skipped when `parallel` is already on PATH, so re-running the pipeline does
# not ask for sudo again.
if ! command -v parallel >/dev/null 2>&1; then
  echo "Installing GNU parallel..."
  sudo apt-get update -qq
  sudo apt-get install -y parallel
fi
# Install Anaconda
wget https://repo.anaconda.com/archive/Anaconda3-2024.02-1-Linux-x86_64.sh
bash Anaconda3-2024.02-1-Linux-x86_64.sh -b -p "$HOME/anaconda3"
# Batch mode (-b) does not edit shell startup files, so `conda` is not on PATH
# yet. Load conda's shell integration and activate base so that `conda` and
# the `mamba` installed below are both callable from this script.
source "$HOME/anaconda3/etc/profile.d/conda.sh"
conda activate base
# -y: never stop at a confirmation prompt in an unattended run.
conda install -c conda-forge mamba -y
# Create and activate conda environment
# Honour ENV_NAME from the configuration section; fall back to the historical
# name when it is not set.
env_name="${ENV_NAME:-tetranscript}"
mamba create -n "$env_name" python=3.7 -y
# `conda activate` needs conda's shell functions, which a non-interactive
# script does not inherit from ~/.bashrc; load them explicitly.
eval "$(conda shell.bash hook)"
conda activate "$env_name"
# Install dependencies
# Alignment/interval tools come from bioconda.
mamba install -c bioconda samtools bedtools -y
# R is pinned to a fixed version from conda-forge.
mamba install -c conda-forge r-base=4.4.0 -y
# Install TEtranscripts
# -O / -o keep re-runs deterministic: the archive is overwritten instead of
# being saved as master.zip.1, and unzip replaces files without prompting.
wget -O master.zip https://github.com/mhammell-laboratory/TEtranscripts/archive/refs/heads/master.zip
unzip -o master.zip
# Install straight from the extracted directory. No `cd` is involved, so a
# missing directory makes pip fail cleanly instead of installing from, or
# leaving the script in, the wrong working directory.
pip install ./TEtranscripts-master
# Download TE GTF
# -f: fail on an HTTP error instead of saving the error page as TE_GTF.zip.
curl -fL -o TE_GTF.zip "https://www.dropbox.com/scl/fo/jdpgn6fl8ngd3th3zebap/ACdZkShDC1au-OckIipI5kM/TEtranscripts/TE_GTF?rlkey=41oz6ppggy82uha5i3yo1rnlx&dl=1"
# -o / -f: overwrite leftovers from a previous run without prompting.
unzip -o TE_GTF.zip -d TE_GTF
gunzip -f TE_GTF/*.gz
# Download and unzip GENCODE annotation
wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_44/gencode.v44.annotation.gtf.gz
# -f: replace an existing gencode.v44.annotation.gtf from an earlier run
# instead of prompting (which fails when the script runs unattended).
gunzip -f gencode.v44.annotation.gtf.gz
# Check the chromosome name style
# Print the sequence names found in the first records of each annotation.
# Lines starting with '#' are dropped first: GTF files may begin with header
# lines (GENCODE's do), and those would otherwise be all that `head` shows.
grep -v '^#' gencode.v44.annotation.gtf | head | cut -f1 | sort | uniq
# The TE annotation was unpacked into TE_GTF/, so read it from there.
grep -v '^#' TE_GTF/hg38_rmsk_TE.gtf | head | cut -f1 | sort | uniq
# Fix chromosome names (if needed)
# Strip a leading "chr" from the sequence-name column so both annotations use
# the same naming style; the originals are left untouched and the result is
# written to *.fixed.gtf.
# NOTE(review): only needed when the BAM files use names without "chr"; confirm.
sed 's/^chr//' TE_GTF/hg38_rmsk_TE.gtf > TE_GTF/hg38_rmsk_TE.fixed.gtf
sed 's/^chr//' gencode.v44.annotation.gtf > gencode.v44.annotation.fixed.gtf
# Define TEcount wrapper
#######################################
# Run TEcount on a single BAM file.
# Globals:   GTF (read)        - gene annotation GTF
#            TE_GTF (read)     - transposable-element annotation GTF
#            OUTPUT_DIR (read) - directory handed to TEcount as --outdir
# Arguments: $1 - path to the BAM file; the sample/project name is its
#                 basename without the ".bam" suffix
# Outputs:   progress line on stdout, validation errors on stderr
# Returns:   TEcount's exit status, or 1 when input validation fails
#######################################
run_tecount() {
  local BAM_FILE="${1:-}"
  local SAMPLE_NAME

  if [[ -z "$BAM_FILE" ]]; then
    echo "run_tecount: missing BAM file argument" >&2
    return 1
  fi
  # When invoked through GNU parallel this runs in a new shell, so these must
  # have been exported by the caller; fail early with a clear message if not.
  if [[ -z "${GTF:-}" || -z "${TE_GTF:-}" || -z "${OUTPUT_DIR:-}" ]]; then
    echo "run_tecount: GTF, TE_GTF and OUTPUT_DIR must be set and exported" >&2
    return 1
  fi

  SAMPLE_NAME=$(basename -- "$BAM_FILE" .bam)
  # Make sure the output directory exists before any job writes to it.
  mkdir -p -- "$OUTPUT_DIR" || return 1

  echo "Running TEcount for $SAMPLE_NAME..."
  TEcount \
    -b "$BAM_FILE" \
    --GTF "$GTF" \
    --TE "$TE_GTF" \
    --format BAM \
    --stranded no \
    --mode multi \
    --project "$SAMPLE_NAME" \
    --outdir "$OUTPUT_DIR" \
    --verbose 3
}
# Exported so the shells spawned by GNU parallel can call the function.
export -f run_tecount
# Run TEcount in parallel for all BAM files
# GNU parallel starts every job in a fresh shell, so run_tecount only sees
# variables that are exported (the function itself needs `export -f`).
export GTF TE_GTF OUTPUT_DIR
# NUL-delimited names (-print0 / -0) survive spaces and newlines in paths.
# The job count follows THREADS, defaulting to 4 when it is unset.
find "$BAM_DIR" -name "*.bam" -print0 \
  | parallel -0 -j "${THREADS:-4}" run_tecount