-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathgenerate_stats.sh
More file actions
57 lines (48 loc) · 1.55 KB
/
generate_stats.sh
File metadata and controls
57 lines (48 loc) · 1.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#!/bin/bash -l
# author: Lucas Patel
# date: not recorded (this header held an unexpanded `date +%Y-%m-%d` placeholder)
# description: Generate directory-wise summary statistics for .fastq and .fastq.gz files
#
# Usage: generate_stats.sh <directory> <identifier> [<X> <Y>]
#   directory   directory whose *.fastq / *.fastq.gz files are summarised; the
#               report is written to <directory>/summary_statistics.tsv
#   identifier  value written to the "key" column of every row
#   X, Y        optional non-negative integers; only when BOTH are supplied and
#               numeric does the report get the human/microbe count and
#               TP/FP/TN/FN columns

directory=$1
identifier=$2
X=$3
Y=$4

# Refuse to run without a real directory: an empty value would make the report
# path below resolve to "/summary_statistics.tsv".
if [[ -z "$directory" || ! -d "$directory" ]]; then
  printf 'Usage: %s <directory> <identifier> [<X> <Y>]\n' "$0" >&2
  printf 'error: "%s" is not a directory\n' "$directory" >&2
  exit 1
fi

output_file="${directory}/summary_statistics.tsv"

# Check if X and Y are provided and numeric; the report header (and later the
# per-file rows) depend on this flag.
if [[ -n $X && -n $Y && $X =~ ^[0-9]+$ && $Y =~ ^[0-9]+$ ]]; then
  compute_confusion_matrix=true
  header='file\tkey\tline count\thuman count\tmicrobe count\tTP\tFP\tTN\tFN\n'
else
  compute_confusion_matrix=false
  header='file\tkey\tline count\n'
  # Something was passed for X/Y but it is unusable: say so instead of silently
  # producing the short report.
  if [[ -n $X || -n $Y ]]; then
    printf 'warning: X and Y must both be non-negative integers; confusion matrix columns are omitted\n' >&2
  fi
fi

# The header is a fixed format string, so printf expands its \t and \n escapes.
# shellcheck disable=SC2059
if ! printf "$header" > "$output_file"; then
  printf 'error: cannot write to "%s"\n' "$output_file" >&2
  exit 1
fi
# Count the lines of a file that match a pattern.
# Arguments:
#   $1 - path to the file; names ending in .gz are searched with zgrep,
#        everything else with grep
#   $2 - grep pattern; an empty pattern matches every line, which yields
#        the total line count
# Outputs:
#   the number of matching lines on stdout
# Returns:
#   the exit status of grep/zgrep (grep reports non-zero when no line matched,
#   even though it still prints 0)
calculate_counts() {
  # Keep both values local so a caller's own `file` variable is not overwritten.
  local file=$1
  local pattern=$2
  # -e keeps a pattern that starts with "-" from being parsed as an option;
  # -- does the same for the file name.
  if [[ $file == *.gz ]]; then
    zgrep -c -e "$pattern" -- "$file"
  else
    grep -c -e "$pattern" -- "$file"
  fi
}
# Loop over all .fastq and .fastq.gz files in the directory and append one TSV
# row per file to the report. The report is opened once for the whole loop.
for file in "${directory}"/*.{fastq,fastq.gz}; do
  # A glob that matches nothing stays a literal pattern; -f filters it out.
  [[ -f "$file" ]] || continue

  line_count=$(calculate_counts "$file" "")
  human_count=$(calculate_counts "$file" "HUMAN")
  microbe_count=$(calculate_counts "$file" "MICROBE")

  # If a count could not be obtained (e.g. unreadable file) the value is empty;
  # feeding that to $(( )) is a syntax error that would most likely end the
  # loop early. Skip just this file.
  if [[ ! $line_count =~ ^[0-9]+$ || ! $human_count =~ ^[0-9]+$ || ! $microbe_count =~ ^[0-9]+$ ]]; then
    printf 'warning: could not count lines in "%s"; skipping\n' "$file" >&2
    continue
  fi

  file_path=$(realpath "$file")

  # printf '%s' writes path and identifier verbatim; `echo -e` would expand any
  # backslash sequences they contain and corrupt the row.
  if [ "$compute_confusion_matrix" = true ]; then
    # X and Y are the totals the surviving HUMAN / MICROBE line counts are
    # subtracted from (presumably the expected totals before filtering —
    # confirm with the caller). 10# forces base 10 so values such as "08"
    # are not read as octal.
    TP=$((10#$X - human_count))
    FP=$((10#$Y - microbe_count))
    TN=$microbe_count
    FN=$human_count
    printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' \
      "$file_path" "$identifier" "$line_count" "$human_count" "$microbe_count" \
      "$TP" "$FP" "$TN" "$FN"
  else
    printf '%s\t%s\t%s\n' "$file_path" "$identifier" "$line_count"
  fi
done >> "$output_file"