Skip to content

Commit dff3687

Browse files
Update code_optim
1 parent 0c16c76 commit dff3687

File tree

1 file changed

+113
-121
lines changed
  • docs/chatbot_comparison/results/amazon_q

1 file changed

+113
-121
lines changed
Lines changed: 113 additions & 121 deletions
Original file line numberDiff line numberDiff line change
@@ -1,155 +1,147 @@
1+
Generating code...
2+
3+
I'll share updates here as I work on this.
4+
5+
Summary of changes
6+
7+
To optimize the bash script, I will create a new file with improved code organization, error handling, logging, and parallel processing capabilities. First, I will generate the optimized version focusing on better structure and efficiency.
8+
9+
Files used
10+
These fields can change throughout code generation
11+
12+
Reviewed: 0
13+
14+
Created: 1
15+
16+
optimized_processing_pipeline.sh
17+
18+
Changed: 0
19+
20+
Deleted: 0
21+
#!/bin/bash

# Set strict error handling
set -euo pipefail

# Configuration
# Each value below is a default; exporting a variable of the same name before
# running the script overrides it.
INPUT_DIR="${INPUT_DIR:-/mnt/c/Users/ramsivakumar/chatbot_testing/data}"
OUTPUT_DIR="${OUTPUT_DIR:-/mnt/c/Users/ramsivakumar/chatbot_testing/bacass_optim_out}"
LOG_FILE="${LOG_FILE:-${OUTPUT_DIR}/pipeline.log}"
MAX_PARALLEL_JOBS="${MAX_PARALLEL_JOBS:-4}" # Adjust based on available resources

# Function to log messages with timestamps
#
# Globals:   LOG_FILE (read; appended to)
# Arguments: $1 - message text
# Outputs:   "[YYYY-mm-dd HH:MM:SS] message" on stdout and appended to LOG_FILE
log_message() {
    local log_dir
    log_dir=$(dirname -- "$LOG_FILE")
    # The first message may be logged before the output tree has been created;
    # without this, tee fails and pipefail + errexit abort the whole script.
    mkdir -p -- "$log_dir"
    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a "$LOG_FILE"
}

# Function to check required tools
#
# Verifies that each required command can be resolved with `command -v`.
# Every missing tool is reported before exiting, so all of them can be
# installed in one pass instead of being discovered one run at a time.
#
# Arguments: optional list of tool names to check; when none are given the
#            pipeline's own tools (fastp, fastqc, multiqc, unicycler) are used
# Outputs:   one ERROR line per missing tool, via log_message
# Exits:     with status 1 when at least one tool is missing
check_dependencies() {
    local -a required_tools=("fastp" "fastqc" "multiqc" "unicycler")
    local tool
    local missing=0

    if (( $# > 0 )); then
        required_tools=("$@")
    fi

    for tool in "${required_tools[@]}"; do
        if ! command -v "$tool" &> /dev/null; then
            log_message "ERROR: Required tool '$tool' not found. Please install it and try again."
            missing=1
        fi
    done

    if (( missing )); then
        exit 1
    fi
}

# Function to create output directories
#
# Creates one sub-directory of OUTPUT_DIR per pipeline stage.
#
# Globals: OUTPUT_DIR (read)
# Outputs: a confirmation line via log_message
create_directories() {
    local -a dirs=("fastp_output" "fastqc_output" "unicycler_output" "multiqc_output")
    local dir # keep the loop variable out of the global scope

    for dir in "${dirs[@]}"; do
        mkdir -p "${OUTPUT_DIR}/${dir}"
    done
    log_message "Created output directories"
}

# Function to process a single sample
#
# Runs fastp, then FastQC on the fastp output, then Unicycler on the same
# output, stopping at the first step that fails.
#
# Globals:   OUTPUT_DIR (read), LOG_FILE (read; tool stderr is appended to it)
# Arguments: $1 - sample name (used to build output file names)
#            $2 - path to the R1 FASTQ file
#            $3 - path to the R2 FASTQ file
# Returns:   0 when all three steps succeed, 1 otherwise
process_sample() {
    # Fail with a readable message instead of an "unbound variable" abort.
    if (( $# < 3 )); then
        log_message "ERROR: process_sample requires <sample_name> <R1 fastq> <R2 fastq>"
        return 1
    fi

    local sample_name="$1"
    local fastq_r1="$2"
    local fastq_r2="$3"
    local fastq_file

    log_message "Starting processing of sample: ${sample_name}"

    # Check that both inputs exist before any tool is started.
    for fastq_file in "$fastq_r1" "$fastq_r2"; do
        if [[ ! -f "$fastq_file" ]]; then
            log_message "ERROR: Input file ${fastq_file} not found for ${sample_name}"
            return 1
        fi
    done

    # Create sample-specific output directories
    local fastp_output_dir="${OUTPUT_DIR}/fastp_output"
    local fastqc_output_dir="${OUTPUT_DIR}/fastqc_output/${sample_name}_fastqc"
    local unicycler_output_dir="${OUTPUT_DIR}/unicycler_output/${sample_name}_unicycler"
    mkdir -p "$fastp_output_dir" "$fastqc_output_dir" "$unicycler_output_dir"

    # Step 1: Run fastp
    local fastp_output_r1="${fastp_output_dir}/${sample_name}_R1.fastp.fastq"
    local fastp_output_r2="${fastp_output_dir}/${sample_name}_R2.fastp.fastq"
    log_message "Running fastp for ${sample_name}"
    if ! fastp -i "$fastq_r1" -I "$fastq_r2" \
        -o "$fastp_output_r1" -O "$fastp_output_r2" \
        --json "${fastp_output_dir}/${sample_name}_fastp.json" \
        --html "${fastp_output_dir}/${sample_name}_fastp.html" \
        2>> "$LOG_FILE"; then
        log_message "ERROR: fastp failed for ${sample_name}"
        return 1
    fi

    # Step 2: Run fastqc
    log_message "Running FastQC for ${sample_name}"
    if ! fastqc "$fastp_output_r1" "$fastp_output_r2" \
        -o "$fastqc_output_dir" --quiet 2>> "$LOG_FILE"; then
        log_message "ERROR: FastQC failed for ${sample_name}"
        return 1
    fi

    # Step 3: Run unicycler
    log_message "Running Unicycler for ${sample_name}"
    if ! unicycler -1 "$fastp_output_r1" -2 "$fastp_output_r2" \
        -o "$unicycler_output_dir" --verbosity 0 2>> "$LOG_FILE"; then
        log_message "ERROR: Unicycler failed for ${sample_name}"
        return 1
    fi

    log_message "Completed processing of sample: ${sample_name}"
    return 0
}

# Pipeline entry point.
#
# Finds every "<sample>_1.fastq.gz" / "<sample>_2.fastq.gz" pair in INPUT_DIR,
# runs process_sample on each pair with at most MAX_PARALLEL_JOBS running at
# once, then runs MultiQC over OUTPUT_DIR.
#
# Globals: INPUT_DIR, OUTPUT_DIR, LOG_FILE, MAX_PARALLEL_JOBS (all read)
# Exits:   1 when INPUT_DIR is missing, no read pairs are found, MultiQC
#          fails, or any sample fails; returns 0 otherwise
main() {
    local -a pids=()           # PIDs of running jobs, oldest first
    local -a names=()          # sample name for each entry in pids
    local -a failed_samples=()
    local sample_count=0
    local fastq_file_r1 fastq_file_r2 sample_name i

    # The log file lives under OUTPUT_DIR, so it must exist before logging.
    mkdir -p "$OUTPUT_DIR"
    log_message "Starting pipeline"

    if [[ ! -d "$INPUT_DIR" ]]; then
        log_message "ERROR: Input directory ${INPUT_DIR} does not exist"
        exit 1
    fi

    # Check dependencies
    check_dependencies

    # Create output directories
    create_directories

    # Process samples in parallel
    for fastq_file_r1 in "$INPUT_DIR"/*_1.fastq.gz; do
        # An unmatched glob is left as the literal pattern; skip it.
        [[ -f "$fastq_file_r1" ]] || continue

        sample_name=$(basename "$fastq_file_r1" "_1.fastq.gz")
        fastq_file_r2="${INPUT_DIR}/${sample_name}_2.fastq.gz"

        # Check if R2 file exists
        if [[ ! -f "$fastq_file_r2" ]]; then
            log_message "Warning: Corresponding R2 file for $fastq_file_r1 not found. Skipping this pair."
            continue
        fi

        # Limit parallel jobs: reap the oldest job before starting another.
        # Testing wait inside `if` records a failure without tripping set -e.
        if (( ${#pids[@]} >= MAX_PARALLEL_JOBS )); then
            if ! wait "${pids[0]}"; then
                failed_samples+=("${names[0]}")
            fi
            pids=("${pids[@]:1}")
            names=("${names[@]:1}")
        fi

        # Process sample in background
        process_sample "$sample_name" "$fastq_file_r1" "$fastq_file_r2" &
        pids+=("$!")
        names+=("$sample_name")
        # Not ((sample_count++)): that returns status 1 when the old value is
        # 0, which aborts the script under set -e.
        sample_count=$((sample_count + 1))
    done

    # Wait for remaining jobs one by one so each exit status is seen
    # (a bare `wait` discards them).
    if (( ${#pids[@]} > 0 )); then
        for i in "${!pids[@]}"; do
            if ! wait "${pids[$i]}"; then
                failed_samples+=("${names[$i]}")
            fi
        done
    fi

    if (( sample_count == 0 )); then
        log_message "ERROR: No *_1.fastq.gz / *_2.fastq.gz pairs found in ${INPUT_DIR}"
        exit 1
    fi

    # Run multiqc on all results
    log_message "Running MultiQC on all results"
    if ! multiqc "$OUTPUT_DIR" -o "${OUTPUT_DIR}/multiqc_output" 2>> "$LOG_FILE"; then
        log_message "ERROR: MultiQC failed"
        exit 1
    fi

    if (( ${#failed_samples[@]} > 0 )); then
        log_message "ERROR: ${#failed_samples[@]} of ${sample_count} samples failed: ${failed_samples[*]}"
        exit 1
    fi

    log_message "Pipeline completed successfully. Processed ${sample_count} samples."
}
# Run the main function, forwarding any command-line arguments
main "$@"

0 commit comments

Comments
 (0)