LANL-Bioinformatics
diff --git a/‎workflows/Nextflow/modules/runFaQCs/resources/usr/bin/faqcs_ATGCcomposition.py‎
Lines changed: 135 additions & 0 deletions b/‎workflows/Nextflow/modules/runFaQCs/resources/usr/bin/faqcs_ATGCcomposition.py‎
Lines changed: 135 additions & 0 deletions
diff --git a/‎workflows/Nextflow/modules/runFaQCs/resources/usr/bin/faqcs_ATGCcontent.py‎
Lines changed: 98 additions & 0 deletions b/‎workflows/Nextflow/modules/runFaQCs/resources/usr/bin/faqcs_ATGCcontent.py‎
Lines changed: 98 additions & 0 deletions
diff --git a/‎workflows/Nextflow/modules/runFaQCs/resources/usr/bin/faqcs_len_histogram.py‎
Lines changed: 75 additions & 0 deletions b/‎workflows/Nextflow/modules/runFaQCs/resources/usr/bin/faqcs_len_histogram.py‎
Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,135 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+This script reads per-cycle base count matrices (tab-separated columns A, T, C, G, N), computes the percentage of each of A, T, C and G per cycle and the N count per million reads, and generates side-by-side line plots using Plotly.
+It reads one matrix from before trimming and one from after trimming, allowing for comparison of the per-cycle base composition.
+"""
+import pandas as pd
+import numpy as np
+import plotly.graph_objs as go
+from plotly.subplots import make_subplots
+import argparse
+
+def atcg_composition_plot(base_matrix_file, xlab, ylab, xlab_adj=0):
+    df = pd.read_csv(base_matrix_file, sep="\t", header=None, names=["A", "T", "C", "G", "N"])
+
+    row_sums = df[["A", "T", "C", "G", "N"]].sum(axis=1)
+    a_per = df["A"] / row_sums * 100
+    t_per = df["T"] / row_sums * 100
+    c_per = df["C"] / row_sums * 100
+    g_per = df["G"] / row_sums * 100
+    n_base = df["N"]
+    total_reads = max(row_sums)
+
+    xpos = np.arange(1, len(df) + 1)
+    labels = xpos + xlab_adj
+    ymax = np.floor(max(a_per.max(), t_per.max(), c_per.max(), g_per.max())) + 5
+    ymin = max(0, np.floor(min(a_per.min(), t_per.min(), c_per.min(), g_per.min())) - 5)
+
+    fig = go.Figure()
+    fig.add_trace(go.Scatter(x=labels, y=a_per, mode='lines', name='A', line=dict(color='green')))
+    fig.add_trace(go.Scatter(x=labels, y=t_per, mode='lines', name='T', line=dict(color='red')))
+    fig.add_trace(go.Scatter(x=labels, y=c_per, mode='lines', name='C', line=dict(color='blue')))
+    fig.add_trace(go.Scatter(x=labels, y=g_per, mode='lines', name='G', line=dict(color='black')))
+
+    fig.update_layout(
+        title=dict(text="Nucleotide Content Per Cycle", x=0.5),
+        xaxis=dict(title=xlab),
+        yaxis=dict(title=ylab, range=[ymin, ymax]),
+        legend=dict(title="Base")
+    )
+
+    return fig, n_base, total_reads
+
+def n_composition_plot(n_array, xlab, ylab, total_reads, xlab_adj=0):
+    xpos = np.arange(1, len(n_array) + 1)
+    labels = xpos + xlab_adj
+    n_rate = n_array / total_reads * 1_000_000
+
+    fig = go.Figure()
+    fig.add_trace(go.Scatter(x=labels, y=n_rate, mode='lines', name='N', line=dict(color='red')))
+    fig.update_layout(
+        title=dict(text="N Nucleotide Content Per Cycle", x=0.5),
+        xaxis_title=xlab,
+        yaxis_title=ylab,
+        showlegend=False
+    )
+    fig.add_annotation(
+        text=f"Total N bases: {int(n_array.sum())}",
+        x=0.99,
+        y=0.99,
+        xref="paper",
+        yref="paper",
+        showarrow=False,
+        font=dict(size=10),
+        align="right"
+    )
+    return fig
+
+def combine_atcg_plots(fig1, fig2):
+    combined = make_subplots(rows=1, cols=2, subplot_titles=("Input Reads Position", "Trimmed Reads Position"))
+
+    for trace in fig1.data:
+        combined.add_trace(trace, row=1, col=1)
+    for trace in fig2.data:
+        trace.showlegend = False
+        combined.add_trace(trace, row=1, col=2)
+
+    combined.update_layout(
+        title=dict(text="Nucleotide Content Per Cycle", x=0.5),
+        showlegend=True,
+    )
+    combined.update_xaxes(title_text="Cycle", row=1, col=1)
+    combined.update_yaxes(title_text="Base content (%)", row=1, col=1)
+    combined.update_xaxes(title_text="Cycle", row=1, col=2)
+
+    return combined
+
+def combine_n_plots(fig3, fig4):
+    combined = make_subplots(rows=1, cols=2, subplot_titles=("Input Reads Position", "Trimmed Reads Position"))
+
+    for trace in fig3.data:
+        combined.add_trace(trace, row=1, col=1)
+    for trace in fig4.data:
+        combined.add_trace(trace, row=1, col=2)
+
+    combined.update_layout(
+        title=dict(text="N Nucleotide Content Per Cycle", x=0.5),
+        showlegend=False,
+    )
+    combined.update_xaxes(title_text="Cycle", row=1, col=1)
+    combined.update_yaxes(title_text="N Base count per million reads", row=1, col=1)
+    combined.update_xaxes(title_text="Cycle", row=1, col=2)
+
+    return combined
+
+def main():
+    parser = argparse.ArgumentParser(description="Plot base composition across cycles.")
+    parser.add_argument("--input1", required=True, help="Base matrix file for input reads (e.g., qa.QC.base.matrix)")
+    parser.add_argument("--input2", required=True, help="Base matrix file for trimmed reads (e.g., QC.base.matrix)")
+    parser.add_argument("--out_atcg", required=True, help="Output HTML for combined ATCG composition")
+    parser.add_argument("--out_n", required=True, help="Output HTML for combined N base plot")
+    parser.add_argument("--trim5", type=int, default=0, help="Trim adjustment for 5' trimming (default: 0)")
+
+    args = parser.parse_args()
+
+    # Step 1: ATCG composition plots
+    fig1, qa_n_base, qa_total_reads = atcg_composition_plot(args.input1, "Input Reads Base", "Base content (%)", 0)
+    fig2, n_base, total_reads = atcg_composition_plot(args.input2, "Trimmed Reads Base", "", args.trim5)
+
+    combined_atcg = combine_atcg_plots(fig1, fig2)
+
+    combined_atcg.write_html(args.out_atcg)
+    
+    # Step 2: N base plots (if N present)
+    if qa_n_base.sum() > 0:
+        fig3 = n_composition_plot(qa_n_base, "Input Reads Position", "N Base count per million reads", qa_total_reads, 0)
+        fig4 = n_composition_plot(n_base, "Trimmed Reads Position", "", qa_total_reads, args.trim5)
+
+        combined_n = combine_n_plots(fig3, fig4)
+
+        combined_n.write_html(args.out_n)
+        print(f"[✓] N base plot saved to {args.out_n}")
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,98 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+This script reads base content data from a specified file, computes statistics for each base ratio (A, T, C, G, and GC), and generates a multi-panel plot using Plotly.
+It supports reading data before and after trimming, allowing for comparison of base content distributions.
+"""
+import pandas as pd
+import numpy as np
+import plotly.graph_objs as go
+from plotly.subplots import make_subplots
+import argparse
+import os
+
+def read_gc_plot(base_content_file, title_prefix):
+    df = pd.read_csv(base_content_file, sep="\t", header=None, names=["Base", "Percent", "Count"])
+    
+    base_data = {}
+    for base in ["A", "T", "C", "G", "GC"]:
+        base_data[base] = df[df["Base"] == base][["Percent", "Count"]].astype(float)
+
+    def compute_stats(base_df):
+        avg = np.average(base_df["Percent"], weights=base_df["Count"])
+        std = np.sqrt(np.average((base_df["Percent"] - avg)**2, weights=base_df["Count"]))
+        bin_counts = base_df.groupby(pd.cut(base_df["Percent"], bins=np.arange(0, 101, 1)))["Count"].sum()
+        return avg, std, bin_counts.fillna(0)
+
+    stats = {base: compute_stats(base_data[base]) for base in ["GC", "A", "T", "C", "G"]}
+
+    fig = make_subplots(
+        rows=5, cols=1,
+        row_heights=[0.4, 0.15, 0.15, 0.15, 0.15],
+        shared_xaxes=False,
+        subplot_titles=[
+            f"{title_prefix} GC (%)",
+            f"A: {stats['A'][0]:.2f}% ± {stats['A'][1]:.2f}",
+            f"T: {stats['T'][0]:.2f}% ± {stats['T'][1]:.2f}",
+            f"C: {stats['C'][0]:.2f}% ± {stats['C'][1]:.2f}",
+            f"G: {stats['G'][0]:.2f}% ± {stats['G'][1]:.2f}"
+        ]
+    )
+
+    base_colors = {
+        "GC": "purple",
+        "A": "green",
+        "T": "red",
+        "C": "blue",
+        "G": "black"
+    }
+
+    fig.add_trace(go.Bar(
+        x=np.arange(0, 100),
+        y=stats["GC"][2].values / 1_000_000,
+        name="GC",
+        marker=dict(line=dict(width=0.5)),
+        marker_color=base_colors["GC"]
+    ), row=1, col=1)
+
+    for i, base in enumerate(["A", "T", "C", "G"], start=2):
+        fig.add_trace(go.Bar(
+            x=np.arange(0, 100),
+            y=stats[base][2].values / 1_000_000,
+            name=base,
+            marker_color=base_colors[base]
+        ), row=i, col=1)
+
+    fig.update_layout(
+        height=800,
+        title=dict(
+            text=f"Reads GC Content - {title_prefix}",
+            x=0.5,
+            xanchor='center'
+        ),
+        showlegend=False
+    )
+
+    fig.update_yaxes(title_text="Count (millions)", row=1, col=1)
+    return fig
+
+def main():
+    parser = argparse.ArgumentParser(description="Plot base content distributions from QC output.")
+    parser.add_argument("--input1", required=True, help="Input base content file before trimming (e.g., qa.QC.base_content.txt)")
+    parser.add_argument("--input2", required=True, help="Input base content file after trimming (e.g., QC.base_content.txt)")
+    parser.add_argument("--out1", required=True, help="Output HTML for the first GC plot")
+    parser.add_argument("--out2", required=True, help="Output HTML for the second GC plot")
+
+    args = parser.parse_args()
+
+    fig1 = read_gc_plot(args.input1, title_prefix="Input Reads")
+    fig2 = read_gc_plot(args.input2, title_prefix="Trimmed Reads")
+
+    fig1.write_html(args.out1)
+    print(f"Saved: {args.out1}")
+
+    fig2.write_html(args.out2)
+    print(f"Saved: {args.out2}")
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,75 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+import pandas as pd
+import numpy as np
+import plotly.graph_objs as go
+from plotly.subplots import make_subplots
+import argparse
+
+def length_histogram(input_file, xlab, ylab):
+    df = pd.read_csv(input_file, sep="\t", header=None)
+    length_list = df.iloc[:, 0].astype(int)
+    length_count = df.iloc[:, 1].astype(int)
+    
+    # Stats
+    len_avg = np.average(length_list, weights=length_count)
+    len_std = np.sqrt(np.average((length_list - len_avg)**2, weights=length_count))
+    len_max = length_list[length_count > 0].max()
+    len_min = length_list[length_count > 0].min()
+    total_reads = length_count.sum()
+
+    # Bar chart
+    bar = go.Bar(
+        x=length_list,
+        y=length_count / 1_000_000,  # Convert to millions
+        name=f'{xlab}',
+        text=[f"{c/1_000_000:.2f}M" for c in length_count],
+        textposition='auto'
+    )
+
+    # Annotation text
+    annotations = [
+        f"Mean: {len_avg:.2f} ± {len_std:.2f}",
+        f"Max: {len_max}",
+        f"Min: {len_min}"
+    ]
+    return bar, annotations, total_reads
+
+def combine_length_histogram(fig1, fig2, fig1_anno, fig2_anno):
+    fig = make_subplots(rows=1, cols=2, subplot_titles=("Input Length", "Trimmed Length"))
+
+    fig.add_trace(fig1, row=1, col=1)
+    fig.add_trace(fig2, row=1, col=2)
+
+    fig.update_layout(
+        title=dict(
+            text="Reads Length Histogram",
+            x=0.5,
+            xanchor='center'
+        ),
+        annotations=[
+            dict(text="; ".join(fig1_anno), x=1, y=1, xref="x1", yref="paper", showarrow=False, align="left", font=dict(size=12)),
+            dict(text="; ".join(fig2_anno), x=1, y=1, xref="x2", yref="paper", showarrow=False, align="left", font=dict(size=12))
+        ],
+        xaxis_title="Length",
+        yaxis_title="Count (millions)",
+        bargap=0.1
+    )
+    return fig
+
+def main():
+    parser = argparse.ArgumentParser(description="Generate read length histograms from QC data.")
+    parser.add_argument("--input1", required=True, help="Input file for untrimmed read length histogram (TSV format)")
+    parser.add_argument("--input2", required=True, help="Input file for trimmed read length histogram (TSV format)")
+    parser.add_argument("--output", required=True, help="Output HTML file for the plot")
+
+    args = parser.parse_args()
+
+    qa_bar, qa_annot, qa_total = length_histogram(args.input1, "Input Length", "Count (millions)")
+    main_bar, main_annot, main_total = length_histogram(args.input2, "Trimmed Length", "Count (millions)")
+
+    fig = combine_length_histogram(qa_bar, main_bar, qa_annot, main_annot)
+    fig.write_html(args.output)
+
+if __name__ == "__main__":
+    main()