1+ #!/usr/bin/env python3
2+ # -*- coding: utf-8 -*-
3+ """
4+ This script reads base content data from a specified file, computes statistics for each base per cycle (A, T, C, G, and GC), and generates a multi-panel plot using Plotly.
5+ It supports reading data before and after trimming, allowing for comparison of base content distributions.
6+ """
7+ import pandas as pd
8+ import numpy as np
9+ import plotly .graph_objs as go
10+ from plotly .subplots import make_subplots
11+ import argparse
12+
13+ def atcg_composition_plot (base_matrix_file , xlab , ylab , xlab_adj = 0 ):
14+ df = pd .read_csv (base_matrix_file , sep = "\t " , header = None , names = ["A" , "T" , "C" , "G" , "N" ])
15+
16+ row_sums = df [["A" , "T" , "C" , "G" , "N" ]].sum (axis = 1 )
17+ a_per = df ["A" ] / row_sums * 100
18+ t_per = df ["T" ] / row_sums * 100
19+ c_per = df ["C" ] / row_sums * 100
20+ g_per = df ["G" ] / row_sums * 100
21+ n_base = df ["N" ]
22+ total_reads = max (row_sums )
23+
24+ xpos = np .arange (1 , len (df ) + 1 )
25+ labels = xpos + xlab_adj
26+ ymax = np .floor (max (a_per .max (), t_per .max (), c_per .max (), g_per .max ())) + 5
27+ ymin = max (0 , np .floor (min (a_per .min (), t_per .min (), c_per .min (), g_per .min ())) - 5 )
28+
29+ fig = go .Figure ()
30+ fig .add_trace (go .Scatter (x = labels , y = a_per , mode = 'lines' , name = 'A' , line = dict (color = 'green' )))
31+ fig .add_trace (go .Scatter (x = labels , y = t_per , mode = 'lines' , name = 'T' , line = dict (color = 'red' )))
32+ fig .add_trace (go .Scatter (x = labels , y = c_per , mode = 'lines' , name = 'C' , line = dict (color = 'blue' )))
33+ fig .add_trace (go .Scatter (x = labels , y = g_per , mode = 'lines' , name = 'G' , line = dict (color = 'black' )))
34+
35+ fig .update_layout (
36+ title = dict (text = "Nucleotide Content Per Cycle" , x = 0.5 ),
37+ xaxis = dict (title = xlab ),
38+ yaxis = dict (title = ylab , range = [ymin , ymax ]),
39+ legend = dict (title = "Base" )
40+ )
41+
42+ return fig , n_base , total_reads
43+
44+ def n_composition_plot (n_array , xlab , ylab , total_reads , xlab_adj = 0 ):
45+ xpos = np .arange (1 , len (n_array ) + 1 )
46+ labels = xpos + xlab_adj
47+ n_rate = n_array / total_reads * 1_000_000
48+
49+ fig = go .Figure ()
50+ fig .add_trace (go .Scatter (x = labels , y = n_rate , mode = 'lines' , name = 'N' , line = dict (color = 'red' )))
51+ fig .update_layout (
52+ title = dict (text = "N Nucleotide Content Per Cycle" , x = 0.5 ),
53+ xaxis_title = xlab ,
54+ yaxis_title = ylab ,
55+ showlegend = False
56+ )
57+ fig .add_annotation (
58+ text = f"Total N bases: { int (n_array .sum ())} " ,
59+ x = 0.99 ,
60+ y = 0.99 ,
61+ xref = "paper" ,
62+ yref = "paper" ,
63+ showarrow = False ,
64+ font = dict (size = 10 ),
65+ align = "right"
66+ )
67+ return fig
68+
69+ def combine_atcg_plots (fig1 , fig2 ):
70+ combined = make_subplots (rows = 1 , cols = 2 , subplot_titles = ("Input Reads Position" , "Trimmed Reads Position" ))
71+
72+ for trace in fig1 .data :
73+ combined .add_trace (trace , row = 1 , col = 1 )
74+ for trace in fig2 .data :
75+ trace .showlegend = False
76+ combined .add_trace (trace , row = 1 , col = 2 )
77+
78+ combined .update_layout (
79+ title = dict (text = "Nucleotide Content Per Cycle" , x = 0.5 ),
80+ showlegend = True ,
81+ )
82+ combined .update_xaxes (title_text = "Cycle" , row = 1 , col = 1 )
83+ combined .update_yaxes (title_text = "Base content (%)" , row = 1 , col = 1 )
84+ combined .update_xaxes (title_text = "Cycle" , row = 1 , col = 2 )
85+
86+ return combined
87+
88+ def combine_n_plots (fig3 , fig4 ):
89+ combined = make_subplots (rows = 1 , cols = 2 , subplot_titles = ("Input Reads Position" , "Trimmed Reads Position" ))
90+
91+ for trace in fig3 .data :
92+ combined .add_trace (trace , row = 1 , col = 1 )
93+ for trace in fig4 .data :
94+ combined .add_trace (trace , row = 1 , col = 2 )
95+
96+ combined .update_layout (
97+ title = dict (text = "N Nucleotide Content Per Cycle" , x = 0.5 ),
98+ showlegend = False ,
99+ )
100+ combined .update_xaxes (title_text = "Cycle" , row = 1 , col = 1 )
101+ combined .update_yaxes (title_text = "N Base count per million reads" , row = 1 , col = 1 )
102+ combined .update_xaxes (title_text = "Cycle" , row = 1 , col = 2 )
103+
104+ return combined
105+
106+ def main ():
107+ parser = argparse .ArgumentParser (description = "Plot base composition across cycles." )
108+ parser .add_argument ("--input1" , required = True , help = "Base matrix file for input reads (e.g., qa.QC.base.matrix)" )
109+ parser .add_argument ("--input2" , required = True , help = "Base matrix file for trimmed reads (e.g., QC.base.matrix)" )
110+ parser .add_argument ("--out_atcg" , required = True , help = "Output HTML for combined ATCG composition" )
111+ parser .add_argument ("--out_n" , required = True , help = "Output HTML for combined N base plot" )
112+ parser .add_argument ("--trim5" , type = int , default = 0 , help = "Trim adjustment for 5' trimming (default: 0)" )
113+
114+ args = parser .parse_args ()
115+
116+ # Step 1: ATCG composition plots
117+ fig1 , qa_n_base , qa_total_reads = atcg_composition_plot (args .input1 , "Input Reads Base" , "Base content (%)" , 0 )
118+ fig2 , n_base , total_reads = atcg_composition_plot (args .input2 , "Trimmed Reads Base" , "" , args .trim5 )
119+
120+ combined_atcg = combine_atcg_plots (fig1 , fig2 )
121+
122+ combined_atcg .write_html (args .out_atcg )
123+
124+ # Step 2: N base plots (if N present)
125+ if qa_n_base .sum () > 0 :
126+ fig3 = n_composition_plot (qa_n_base , "Input Reads Position" , "N Base count per million reads" , qa_total_reads , 0 )
127+ fig4 = n_composition_plot (n_base , "Trimmed Reads Position" , "" , qa_total_reads , args .trim5 )
128+
129+ combined_n = combine_n_plots (fig3 , fig4 )
130+
131+ combined_n .write_html (args .out_n )
132+ print (f"[✓] N base plot saved to { args .out_n } " )
133+
134+ if __name__ == "__main__" :
135+ main ()
0 commit comments