Skip to content

Commit 972f1d7

Browse files
authored
Merge pull request #144 from LANL-Bioinformatics/142-convert-faqcs-output-pdf-into-interactive-html
142 convert faqcs output pdf into interactive html
2 parents 897c1f6 + b3ddf79 commit 972f1d7

File tree

10 files changed

+889
-7
lines changed

10 files changed

+889
-7
lines changed
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: utf-8 -*-
3+
"""
4+
This script reads per-cycle base count matrices (columns A, T, C, G, N), computes the percentage of each base per cycle and the N base rate, and generates side-by-side line plots using Plotly.
5+
It supports reading data before and after trimming, allowing for comparison of base content distributions.
6+
"""
7+
import pandas as pd
8+
import numpy as np
9+
import plotly.graph_objs as go
10+
from plotly.subplots import make_subplots
11+
import argparse
12+
13+
def atcg_composition_plot(base_matrix_file, xlab, ylab, xlab_adj=0):
    """Build a per-cycle A/T/C/G percentage line plot from a base matrix file.

    The file is read as headerless, tab-separated text whose columns are
    taken to be A, T, C, G and N, one row per plotted position.

    Args:
        base_matrix_file: Path (or file-like object) handed to ``pd.read_csv``.
        xlab: Title of the x axis.
        ylab: Title of the y axis.
        xlab_adj: Offset added to the 1-based row numbers used as x values.

    Returns:
        A tuple ``(fig, n_base, total_reads)``: the Plotly figure, the N
        column of the matrix, and the largest per-row sum over all five
        columns.
    """
    columns = ["A", "T", "C", "G", "N"]
    trace_colors = {"A": "green", "T": "red", "C": "blue", "G": "black"}

    matrix = pd.read_csv(base_matrix_file, sep="\t", header=None, names=columns)
    per_row_total = matrix[columns].sum(axis=1)
    percent = {base: matrix[base] / per_row_total * 100 for base in trace_colors}

    n_base = matrix["N"]
    total_reads = max(per_row_total)

    cycles = np.arange(1, len(matrix) + 1) + xlab_adj

    # Pad the y range by 5 percentage points on each side, never below zero.
    highest = max([percent[base].max() for base in trace_colors])
    lowest = min([percent[base].min() for base in trace_colors])
    y_top = np.floor(highest) + 5
    y_bottom = max(0, np.floor(lowest) - 5)

    fig = go.Figure()
    for base, color in trace_colors.items():
        fig.add_trace(
            go.Scatter(
                x=cycles,
                y=percent[base],
                mode="lines",
                name=base,
                line={"color": color},
            )
        )

    fig.update_layout(
        title={"text": "Nucleotide Content Per Cycle", "x": 0.5},
        xaxis={"title": xlab},
        yaxis={"title": ylab, "range": [y_bottom, y_top]},
        legend={"title": "Base"},
    )

    return fig, n_base, total_reads
43+
44+
def n_composition_plot(n_array, xlab, ylab, total_reads, xlab_adj=0):
    """Build a line plot of N base counts scaled to counts per million.

    Args:
        n_array: Per-position N counts; must support ``len``, ``sum`` and
            element-wise division (e.g. a pandas Series).
        xlab: Title of the x axis.
        ylab: Title of the y axis.
        total_reads: Denominator used to scale the counts.
        xlab_adj: Offset added to the 1-based positions used as x values.

    Returns:
        The Plotly figure, annotated with the total number of N bases.
    """
    cycles = np.arange(1, len(n_array) + 1) + xlab_adj
    per_million = n_array / total_reads * 1_000_000
    total_n = int(n_array.sum())

    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=cycles,
            y=per_million,
            mode="lines",
            name="N",
            line={"color": "red"},
        )
    )
    fig.update_layout(
        title={"text": "N Nucleotide Content Per Cycle", "x": 0.5},
        xaxis_title=xlab,
        yaxis_title=ylab,
        showlegend=False,
    )
    # Paper coordinates pin the note to the top-right corner of the figure.
    fig.add_annotation(
        text=f"Total N bases: {total_n}",
        x=0.99,
        y=0.99,
        xref="paper",
        yref="paper",
        showarrow=False,
        font={"size": 10},
        align="right",
    )
    return fig
68+
69+
def combine_atcg_plots(fig1, fig2):
    """Place the input and trimmed A/T/C/G figures side by side.

    Traces of ``fig1`` go into the left panel and traces of ``fig2`` into the
    right panel. Only the left panel's traces are listed in the legend; traces
    with the same name share a legend group so one legend entry toggles that
    base in both panels. Neither input figure is modified.

    Args:
        fig1: Figure for the input reads.
        fig2: Figure for the trimmed reads.

    Returns:
        The combined two-panel Plotly figure.
    """
    combined = make_subplots(
        rows=1,
        cols=2,
        subplot_titles=("Input Reads Position", "Trimmed Reads Position"),
    )

    for col, source in enumerate((fig1, fig2), start=1):
        for trace in source.data:
            combined.add_trace(trace, row=1, col=col)

    # Group same-named traces so a legend click affects both panels.
    combined.for_each_trace(lambda trace: trace.update(legendgroup=trace.name))
    # Hide the duplicate legend entries on the combined figure's own traces
    # instead of setting showlegend on the traces that belong to fig2.
    combined.update_traces(showlegend=False, row=1, col=2)

    combined.update_layout(
        title=dict(text="Nucleotide Content Per Cycle", x=0.5),
        showlegend=True,
    )
    combined.update_xaxes(title_text="Cycle", row=1, col=1)
    combined.update_yaxes(title_text="Base content (%)", row=1, col=1)
    combined.update_xaxes(title_text="Cycle", row=1, col=2)

    return combined
87+
88+
def combine_n_plots(fig3, fig4):
    """Place the input and trimmed N-base figures side by side.

    Args:
        fig3: Figure whose traces fill the left panel.
        fig4: Figure whose traces fill the right panel.

    Returns:
        The combined two-panel Plotly figure, without a legend.
    """
    panel_titles = ("Input Reads Position", "Trimmed Reads Position")
    combined = make_subplots(rows=1, cols=2, subplot_titles=panel_titles)

    for column, source in ((1, fig3), (2, fig4)):
        for trace in source.data:
            combined.add_trace(trace, row=1, col=column)

    combined.update_layout(
        title={"text": "N Nucleotide Content Per Cycle", "x": 0.5},
        showlegend=False,
    )

    # Both panels get an x title; only the left one carries the y title.
    combined.update_xaxes(title_text="Cycle", row=1, col=1)
    combined.update_yaxes(title_text="N Base count per million reads", row=1, col=1)
    combined.update_xaxes(title_text="Cycle", row=1, col=2)

    return combined
105+
106+
def main():
    """Parse the command line and write the combined composition plots.

    The A/T/C/G comparison is always written to ``--out_atcg``. The N-base
    comparison is written to ``--out_n`` only when the input (pre-trimming)
    matrix contains at least one N base.
    """
    parser = argparse.ArgumentParser(description="Plot base composition across cycles.")
    parser.add_argument("--input1", required=True, help="Base matrix file for input reads (e.g., qa.QC.base.matrix)")
    parser.add_argument("--input2", required=True, help="Base matrix file for trimmed reads (e.g., QC.base.matrix)")
    parser.add_argument("--out_atcg", required=True, help="Output HTML for combined ATCG composition")
    parser.add_argument("--out_n", required=True, help="Output HTML for combined N base plot")
    parser.add_argument("--trim5", type=int, default=0, help="Trim adjustment for 5' trimming (default: 0)")

    args = parser.parse_args()

    # Step 1: ATCG composition plots
    fig1, qa_n_base, qa_total_reads = atcg_composition_plot(args.input1, "Input Reads Base", "Base content (%)", 0)
    fig2, n_base, total_reads = atcg_composition_plot(args.input2, "Trimmed Reads Base", "", args.trim5)

    combined_atcg = combine_atcg_plots(fig1, fig2)

    combined_atcg.write_html(args.out_atcg)

    # Step 2: N base plots (if N present)
    if qa_n_base.sum() > 0:
        fig3 = n_composition_plot(qa_n_base, "Input Reads Position", "N Base count per million reads", qa_total_reads, 0)
        # Scale the trimmed panel by the trimmed matrix's own total rather
        # than the input matrix's total, so each panel is normalised by the
        # file it was read from.
        fig4 = n_composition_plot(n_base, "Trimmed Reads Position", "", total_reads, args.trim5)

        combined_n = combine_n_plots(fig3, fig4)

        combined_n.write_html(args.out_n)
        print(f"[✓] N base plot saved to {args.out_n}")


if __name__ == "__main__":
    main()
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: utf-8 -*-
3+
"""
4+
This script reads base content data from a specified file, computes statistics for each base ratio (A, T, C, G, and GC), and generates a multi-panel plot using Plotly.
5+
It supports reading data before and after trimming, allowing for comparison of base content distributions.
6+
"""
7+
import pandas as pd
8+
import numpy as np
9+
import plotly.graph_objs as go
10+
from plotly.subplots import make_subplots
11+
import argparse
12+
import os
13+
14+
def _compute_base_stats(base_df):
    """Return the count-weighted mean, std and per-bin counts of one base.

    Args:
        base_df: DataFrame with float columns ``Percent`` and ``Count``.

    Returns:
        A tuple ``(avg, std, bin_counts)`` where ``bin_counts`` is a Series of
        summed counts over 100 one-percent-wide bins covering 0 to 100.
    """
    avg = np.average(base_df["Percent"], weights=base_df["Count"])
    std = np.sqrt(np.average((base_df["Percent"] - avg) ** 2, weights=base_df["Count"]))
    # include_lowest=True keeps rows at exactly 0 %, which the default
    # left-open first interval (0, 1] would silently drop.
    percent_bins = pd.cut(base_df["Percent"], bins=np.arange(0, 101, 1), include_lowest=True)
    # observed=False keeps empty bins, so the result always has 100 entries
    # matching the 100 x positions used by the bar traces.
    bin_counts = base_df.groupby(percent_bins, observed=False)["Count"].sum()
    return avg, std, bin_counts.fillna(0)


def read_gc_plot(base_content_file, title_prefix):
    """Build a five-panel histogram figure of GC, A, T, C and G content.

    The file is read as headerless, tab-separated text with the columns
    Base, Percent and Count. Rows are selected per base label ("GC", "A",
    "T", "C", "G") and their counts are summed into one-percent bins.

    Args:
        base_content_file: Path (or file-like object) handed to ``pd.read_csv``.
        title_prefix: Text used in the figure title and the GC panel title.

    Returns:
        The Plotly figure: a large GC panel followed by one panel per base,
        each base panel titled with its weighted mean and standard deviation.
    """
    df = pd.read_csv(base_content_file, sep="\t", header=None, names=["Base", "Percent", "Count"])

    stats = {}
    for base in ["GC", "A", "T", "C", "G"]:
        base_df = df[df["Base"] == base][["Percent", "Count"]].astype(float)
        stats[base] = _compute_base_stats(base_df)

    fig = make_subplots(
        rows=5, cols=1,
        row_heights=[0.4, 0.15, 0.15, 0.15, 0.15],
        shared_xaxes=False,
        subplot_titles=[
            f"{title_prefix} GC (%)",
            f"A: {stats['A'][0]:.2f}% ± {stats['A'][1]:.2f}",
            f"T: {stats['T'][0]:.2f}% ± {stats['T'][1]:.2f}",
            f"C: {stats['C'][0]:.2f}% ± {stats['C'][1]:.2f}",
            f"G: {stats['G'][0]:.2f}% ± {stats['G'][1]:.2f}"
        ]
    )

    base_colors = {
        "GC": "purple",
        "A": "green",
        "T": "red",
        "C": "blue",
        "G": "black"
    }

    # The bar at x = k shows the bin whose upper edge is k + 1 percent.
    fig.add_trace(go.Bar(
        x=np.arange(0, 100),
        y=stats["GC"][2].values / 1_000_000,
        name="GC",
        marker=dict(line=dict(width=0.5)),
        marker_color=base_colors["GC"]
    ), row=1, col=1)

    for i, base in enumerate(["A", "T", "C", "G"], start=2):
        fig.add_trace(go.Bar(
            x=np.arange(0, 100),
            y=stats[base][2].values / 1_000_000,
            name=base,
            marker_color=base_colors[base]
        ), row=i, col=1)

    fig.update_layout(
        height=800,
        title=dict(
            text=f"Reads GC Content - {title_prefix}",
            x=0.5,
            xanchor='center'
        ),
        showlegend=False
    )

    fig.update_yaxes(title_text="Count (millions)", row=1, col=1)
    return fig
78+
79+
def main():
    """Parse the command line and write one GC content figure per input file.

    Both figures are built first; each is then written to its HTML output and
    a confirmation line is printed.
    """
    parser = argparse.ArgumentParser(description="Plot base content distributions from QC output.")
    option_help = (
        ("--input1", "Input base content file before trimming (e.g., qa.QC.base_content.txt)"),
        ("--input2", "Input base content file after trimming (e.g., QC.base_content.txt)"),
        ("--out1", "Output HTML for the first GC plot"),
        ("--out2", "Output HTML for the second GC plot"),
    )
    for flag, help_text in option_help:
        parser.add_argument(flag, required=True, help=help_text)

    args = parser.parse_args()

    figures = [
        read_gc_plot(args.input1, title_prefix="Input Reads"),
        read_gc_plot(args.input2, title_prefix="Trimmed Reads"),
    ]

    for figure, destination in zip(figures, (args.out1, args.out2)):
        figure.write_html(destination)
        print(f"Saved: {destination}")


if __name__ == "__main__":
    main()
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: utf-8 -*-
3+
import pandas as pd
4+
import numpy as np
5+
import plotly.graph_objs as go
6+
from plotly.subplots import make_subplots
7+
import argparse
8+
9+
def length_histogram(input_file, xlab, ylab):
    """Build a bar trace and summary strings for a read length table.

    The file is read as headerless, tab-separated text; the first column is
    used as the length and the second as the count for that length.

    Args:
        input_file: Path (or file-like object) handed to ``pd.read_csv``.
        xlab: Used as the name of the bar trace.
        ylab: Accepted but not used by this function.

    Returns:
        A tuple ``(bar, annotations, total_reads)``: the ``go.Bar`` trace with
        counts in millions, a list of three summary strings (mean ± std, max,
        min), and the sum of all counts.
    """
    table = pd.read_csv(input_file, sep="\t", header=None)
    lengths = table.iloc[:, 0].astype(int)
    counts = table.iloc[:, 1].astype(int)

    # Count-weighted mean and standard deviation of the lengths.
    mean_len = np.average(lengths, weights=counts)
    variance = np.average((lengths - mean_len) ** 2, weights=counts)
    std_len = np.sqrt(variance)

    # Extremes only consider lengths that actually occur.
    seen = lengths[counts > 0]
    longest = seen.max()
    shortest = seen.min()
    total_reads = counts.sum()

    bar_labels = [f"{count/1_000_000:.2f}M" for count in counts]
    bar = go.Bar(
        x=lengths,
        y=counts / 1_000_000,
        name=f"{xlab}",
        text=bar_labels,
        textposition="auto",
    )

    annotations = [
        f"Mean: {mean_len:.2f} ± {std_len:.2f}",
        f"Max: {longest}",
        f"Min: {shortest}",
    ]
    return bar, annotations, total_reads
37+
38+
def combine_length_histogram(fig1, fig2, fig1_anno, fig2_anno):
    """Place the input and trimmed length histograms side by side.

    Args:
        fig1: Bar trace for the left ("Input Length") panel.
        fig2: Bar trace for the right ("Trimmed Length") panel.
        fig1_anno: Summary strings shown in the left panel.
        fig2_anno: Summary strings shown in the right panel.

    Returns:
        The combined two-panel Plotly figure. Each panel keeps its subplot
        title and shows its summary strings, joined with "; ", in its
        top-left corner.
    """
    fig = make_subplots(rows=1, cols=2, subplot_titles=("Input Length", "Trimmed Length"))

    fig.add_trace(fig1, row=1, col=1)
    fig.add_trace(fig2, row=1, col=2)

    fig.update_layout(
        title=dict(
            text="Reads Length Histogram",
            x=0.5,
            xanchor='center'
        ),
        bargap=0.1
    )
    # Label the x axis of both panels; the y title stays on the left panel.
    fig.update_xaxes(title_text="Length")
    fig.update_yaxes(title_text="Count (millions)", row=1, col=1)

    # Subplot titles are stored as layout annotations, so passing
    # annotations=[...] to update_layout would overwrite them. Append the
    # summaries with add_annotation instead. Domain-relative coordinates keep
    # them visible whatever range of lengths the axis ends up showing.
    panels = (("x", "y", fig1_anno), ("x2", "y2", fig2_anno))
    for x_axis, y_axis, summary in panels:
        fig.add_annotation(
            text="; ".join(summary),
            x=0.01,
            y=0.99,
            xref=f"{x_axis} domain",
            yref=f"{y_axis} domain",
            xanchor="left",
            yanchor="top",
            showarrow=False,
            align="left",
            font=dict(size=12),
        )
    return fig
59+
60+
def main():
    """Parse the command line and write the combined read length histogram.

    Reads the untrimmed and trimmed length tables, combines their bar traces
    into one two-panel figure and writes it to the ``--output`` HTML file.
    """
    parser = argparse.ArgumentParser(description="Generate read length histograms from QC data.")
    option_help = (
        ("--input1", "Input file for untrimmed read length histogram (TSV format)"),
        ("--input2", "Input file for trimmed read length histogram (TSV format)"),
        ("--output", "Output HTML file for the plot"),
    )
    for flag, help_text in option_help:
        parser.add_argument(flag, required=True, help=help_text)

    args = parser.parse_args()

    y_label = "Count (millions)"
    # The third return value (total count) is not needed here.
    input_bar, input_summary, _ = length_histogram(args.input1, "Input Length", y_label)
    trimmed_bar, trimmed_summary, _ = length_histogram(args.input2, "Trimmed Length", y_label)

    combined = combine_length_histogram(input_bar, trimmed_bar, input_summary, trimmed_summary)
    combined.write_html(args.output)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)