|
| 1 | +Generating code... |
| 2 | + |
| 3 | +I'll share updates here as I work on this. |
| 4 | + |
| 5 | +Summary of changes |
| 6 | + |
| 7 | +To optimize the bash script, I will create a new file with improved code organization, error handling, logging, and parallel processing capabilities. First, I will generate the optimized version focusing on better structure and efficiency. |
| 8 | + |
| 9 | +Files used |
| 10 | +These counts may change while code generation is in progress |
| 11 | + |
| 12 | +Reviewed: 0 |
| 13 | + |
| 14 | +Created: 1 |
| 15 | + |
| 16 | +optimized_processing_pipeline.sh |
| 17 | + |
| 18 | +Changed: 0 |
| 19 | + |
| 20 | +Deleted: 0 |
| 21 | + |
#!/bin/bash

# Optimized FASTQ processing pipeline: fastp -> FastQC -> Unicycler -> MultiQC.
# All paths and the job limit may be overridden via environment variables
# before invocation (restores the configurability the old -i/-o flags gave).

# Set strict error handling: exit on error, unset variable, or failed pipe stage.
set -euo pipefail

# Configuration (environment overrides fall back to the previous hard-coded defaults).
INPUT_DIR="${INPUT_DIR:-/mnt/c/Users/ramsivakumar/chatbot_testing/data}"
OUTPUT_DIR="${OUTPUT_DIR:-/mnt/c/Users/ramsivakumar/chatbot_testing/bacass_optim_out}"
LOG_FILE="${LOG_FILE:-${OUTPUT_DIR}/pipeline.log}"
MAX_PARALLEL_JOBS="${MAX_PARALLEL_JOBS:-4}"  # Adjust based on available resources
readonly INPUT_DIR OUTPUT_DIR LOG_FILE MAX_PARALLEL_JOBS
# Log a timestamped message to stdout and append it to $LOG_FILE.
# Globals:   LOG_FILE (read) - log destination; its parent dir is created on demand
# Arguments: $1 - message text
# Outputs:   the formatted line on stdout (and appended to the log file)
log_message() {
    local msg="$1"
    # The log lives under OUTPUT_DIR, which does not exist yet when logging
    # starts (check_dependencies and "Starting pipeline" log before
    # create_directories runs) — create the parent directory so `tee -a`
    # cannot fail and abort the script under `set -e`.
    mkdir -p "$(dirname "$LOG_FILE")"
    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$msg" | tee -a "$LOG_FILE"
}
20 | 37 |
|
# Verify that every external tool the pipeline needs is installed.
# Reports ALL missing tools at once (not just the first) before exiting,
# so the user can fix their environment in a single pass.
# Globals:   none read directly; logs via log_message
# Returns:   0 if all tools are present; exits 1 otherwise
check_dependencies() {
    local required_tools=("fastp" "fastqc" "multiqc" "unicycler")
    local missing_tools=()
    local tool
    for tool in "${required_tools[@]}"; do
        if ! command -v "$tool" &> /dev/null; then
            missing_tools+=("$tool")
        fi
    done
    if ((${#missing_tools[@]} > 0)); then
        log_message "ERROR: Required tool(s) not found: ${missing_tools[*]}. Please install them and try again."
        exit 1
    fi
}
38 | 48 |
|
# Create the per-stage output directories under $OUTPUT_DIR.
# Globals:   OUTPUT_DIR (read); logs via log_message
# Returns:   0 (mkdir -p is idempotent; failures abort under set -e)
create_directories() {
    local stage
    for stage in fastp_output fastqc_output unicycler_output multiqc_output; do
        mkdir -p "${OUTPUT_DIR}/${stage}"
    done
    log_message "Created output directories"
}
46 | 57 |
|
# Run the fastp -> FastQC -> Unicycler chain for one paired-end sample.
# Globals:   OUTPUT_DIR, LOG_FILE (read); logs via log_message
# Arguments: $1 - sample name
#            $2 - path to R1 fastq.gz
#            $3 - path to R2 fastq.gz
# Returns:   0 on success; 1 if any stage fails (error already logged)
process_sample() {
    local sample_name="$1"
    local reads_r1="$2"
    local reads_r2="$3"

    log_message "Starting processing of sample: ${sample_name}"

    # Per-sample output locations for the QC and assembly stages.
    local qc_dir="${OUTPUT_DIR}/fastqc_output/${sample_name}_fastqc"
    local asm_dir="${OUTPUT_DIR}/unicycler_output/${sample_name}_unicycler"
    mkdir -p "$qc_dir" "$asm_dir"

    # Stage 1: adapter/quality trimming with fastp.
    local trimmed_r1="${OUTPUT_DIR}/fastp_output/${sample_name}_R1.fastp.fastq"
    local trimmed_r2="${OUTPUT_DIR}/fastp_output/${sample_name}_R2.fastp.fastq"
    log_message "Running fastp for ${sample_name}"
    fastp -i "$reads_r1" -I "$reads_r2" \
        -o "$trimmed_r1" -O "$trimmed_r2" \
        --json "${OUTPUT_DIR}/fastp_output/${sample_name}_fastp.json" \
        --html "${OUTPUT_DIR}/fastp_output/${sample_name}_fastp.html" \
        2>> "$LOG_FILE" || {
        log_message "ERROR: fastp failed for ${sample_name}"
        return 1
    }

    # Stage 2: QC report on the trimmed reads.
    log_message "Running FastQC for ${sample_name}"
    fastqc "$trimmed_r1" "$trimmed_r2" -o "$qc_dir" --quiet 2>> "$LOG_FILE" || {
        log_message "ERROR: FastQC failed for ${sample_name}"
        return 1
    }

    # Stage 3: de novo assembly.
    log_message "Running Unicycler for ${sample_name}"
    unicycler -1 "$trimmed_r1" -2 "$trimmed_r2" -o "$asm_dir" --verbosity 0 2>> "$LOG_FILE" || {
        log_message "ERROR: Unicycler failed for ${sample_name}"
        return 1
    }

    log_message "Completed processing of sample: ${sample_name}"
    return 0
}
103 | 97 |
|
# Orchestrate the pipeline: dependency check, directory setup, parallel
# per-sample processing (throttled to MAX_PARALLEL_JOBS), then one combined
# MultiQC report.
# Globals:   INPUT_DIR, OUTPUT_DIR, LOG_FILE, MAX_PARALLEL_JOBS (read)
# Returns:   exits 1 if MultiQC or any sample fails; 0 otherwise
main() {
    log_message "Starting pipeline"

    # Check dependencies
    check_dependencies

    # Create output directories
    create_directories

    # Track pid -> sample so per-sample failures can be collected and
    # reported at the end instead of aborting the whole run.
    local pids=()
    local -A pid_sample=()
    local failed_samples=()
    local sample_count=0
    local fastq_file_r1 fastq_file_r2 sample_name pid

    for fastq_file_r1 in "$INPUT_DIR"/*_1.fastq.gz; do
        # Without nullglob an empty directory leaves the literal pattern in
        # the loop variable; skip it rather than emit a spurious warning.
        [[ -e "$fastq_file_r1" ]] || continue

        sample_name=$(basename "$fastq_file_r1" "_1.fastq.gz")
        fastq_file_r2="${INPUT_DIR}/${sample_name}_2.fastq.gz"

        # Check if R2 file exists
        if [[ ! -f "$fastq_file_r2" ]]; then
            log_message "Warning: Corresponding R2 file for $fastq_file_r1 not found. Skipping this pair."
            continue
        fi

        # Process sample in background
        process_sample "$sample_name" "$fastq_file_r1" "$fastq_file_r2" &
        pid=$!
        pids+=("$pid")
        pid_sample["$pid"]="$sample_name"
        # NB: ((sample_count++)) evaluates to 0 on the first sample, which
        # returns status 1 and would kill the script under `set -e`.
        sample_count=$((sample_count + 1))

        # Throttle: reap the oldest job before exceeding the parallel limit.
        # `wait` is used inside a condition so a failed sample is recorded
        # instead of aborting the whole pipeline via `set -e`.
        if ((${#pids[@]} >= MAX_PARALLEL_JOBS)); then
            if ! wait "${pids[0]}"; then
                failed_samples+=("${pid_sample[${pids[0]}]}")
            fi
            pids=("${pids[@]:1}")
        fi
    done

    # Reap the remaining jobs one by one; a bare `wait` would silently
    # discard their exit statuses.
    for pid in "${pids[@]+"${pids[@]}"}"; do
        if ! wait "$pid"; then
            failed_samples+=("${pid_sample[$pid]}")
        fi
    done

    # Run multiqc on all results
    log_message "Running MultiQC on all results"
    if ! multiqc "$OUTPUT_DIR" -o "${OUTPUT_DIR}/multiqc_output" 2>> "$LOG_FILE"; then
        log_message "ERROR: MultiQC failed"
        exit 1
    fi

    # Surface per-sample failures with a non-zero exit, matching the old
    # script's failed_samples summary behavior.
    if ((${#failed_samples[@]} > 0)); then
        log_message "ERROR: ${#failed_samples[@]} sample(s) failed: ${failed_samples[*]}"
        exit 1
    fi

    log_message "Pipeline completed successfully. Processed ${sample_count} samples."
}
# Run the main function, forwarding any command-line arguments (currently
# unused by main, but `main "$@"` is the conventional, future-proof idiom).
main "$@"
0 commit comments