1+ #!/usr/bin/env python3
2+ """
3+ Fast dupRadar analysis script
4+ Replaces R/dupRadar with Python for high-performance processing
5+ Maintains full MultiQC compatibility
6+ """
7+
8+ import sys
9+ import pandas as pd
10+ import numpy as np
11+ import matplotlib .pyplot as plt
12+ from scipy .stats import linregress
13+ import argparse
14+
15+ def main ():
16+ parser = argparse .ArgumentParser (description = 'Fast dupRadar analysis' )
17+ parser .add_argument ('--with-dups' , required = True , help = 'featureCounts output with duplicates' )
18+ parser .add_argument ('--no-dups' , required = True , help = 'featureCounts output without duplicates' )
19+ parser .add_argument ('--prefix' , required = True , help = 'Output prefix' )
20+ args = parser .parse_args ()
21+
22+ # Read featureCounts outputs
23+ with_dups = pd .read_csv (args .with_dups , sep = '\t ' , comment = '#' , skiprows = 1 )
24+ no_dups = pd .read_csv (args .no_dups , sep = '\t ' , comment = '#' , skiprows = 1 )
25+
26+ # Extract count columns (last column is the BAM file)
27+ with_dups_counts = with_dups .iloc [:, - 1 ]
28+ no_dups_counts = no_dups .iloc [:, - 1 ]
29+
30+ # Calculate RPK (Reads per Kilobase)
31+ gene_lengths = with_dups ['Length' ] / 1000 # Convert to kb
32+ with_dups_rpk = with_dups_counts / gene_lengths
33+ no_dups_rpk = no_dups_counts / gene_lengths
34+
35+ # Calculate duplication rate
36+ duplication_rate = np .where (with_dups_rpk > 0 , 1 - (no_dups_rpk / with_dups_rpk ), 0 )
37+
38+ # Create dupMatrix
39+ dup_matrix = pd .DataFrame ({
40+ 'gene_id' : with_dups ['Geneid' ],
41+ 'rpkm' : with_dups_rpk , # Using RPK instead of RPKM for simplicity
42+ 'duplication_rate' : duplication_rate
43+ })
44+
45+ # Filter out genes with zero expression
46+ dup_matrix_filtered = dup_matrix [dup_matrix ['rpkm' ] > 0 ]
47+
48+ # Save dupMatrix
49+ dup_matrix_filtered .to_csv (f"{ args .prefix } _dupMatrix.txt" , sep = '\t ' , index = False )
50+
51+ # Calculate intercept and slope
52+ if len (dup_matrix_filtered ) > 1 :
53+ log_rpkm = np .log10 (dup_matrix_filtered ['rpkm' ])
54+ slope , intercept , r_value , p_value , std_err = linregress (log_rpkm , dup_matrix_filtered ['duplication_rate' ])
55+
56+ # Save intercept and slope
57+ with open (f"{ args .prefix } _intercept_slope.txt" , 'w' ) as f :
58+ f .write (f"intercept\t { intercept } \n " )
59+ f .write (f"slope\t { slope } \n " )
60+ f .write (f"r_squared\t { r_value ** 2 } \n " )
61+
62+ # Generate MultiQC files
63+ with open (f"{ args .prefix } _dup_intercept_mqc.txt" , 'w' ) as f :
64+ f .write ("# plot_type: 'generalstats'\n " )
65+ f .write ("# pconfig:\n " )
66+ f .write ("# dupradar_intercept:\n " )
67+ f .write ("# title: 'dupRadar Intercept'\n " )
68+ f .write ("# description: 'Intercept of the dupRadar fitted curve'\n " )
69+ f .write ("# format: '{:.3f}'\n " )
70+ f .write ("Sample\t dupradar_intercept\n " )
71+ f .write (f"{ args .prefix } \t { intercept :.3f} \n " )
72+
73+ with open (f"{ args .prefix } _duprateExpDensCurve_mqc.txt" , 'w' ) as f :
74+ f .write ("# plot_type: 'linegraph'\n " )
75+ f .write ("# section_name: 'dupRadar'\n " )
76+ f .write ("# description: 'Expression vs Duplication Rate'\n " )
77+ f .write ("# pconfig:\n " )
78+ f .write ("# title: 'dupRadar: Expression vs Duplication Rate'\n " )
79+ f .write ("# xlab: 'Expression (log10 RPK)'\n " )
80+ f .write ("# ylab: 'Duplication Rate'\n " )
81+
82+ # Sample data points for the curve
83+ x_points = np .linspace (log_rpkm .min (), log_rpkm .max (), 100 )
84+ y_points = slope * x_points + intercept
85+
86+ f .write ("log10_rpk\t duplication_rate\n " )
87+ for x , y in zip (x_points , y_points ):
88+ f .write (f"{ x :.3f} \t { y :.3f} \n " )
89+
90+ # Generate basic plots using matplotlib
91+ fig , ((ax1 , ax2 ), (ax3 , ax4 )) = plt .subplots (2 , 2 , figsize = (12 , 10 ))
92+
93+ # Scatter plot
94+ ax1 .scatter (log_rpkm , dup_matrix_filtered ['duplication_rate' ], alpha = 0.6 , s = 1 )
95+ ax1 .plot (x_points , y_points , 'r-' , linewidth = 2 )
96+ ax1 .set_xlabel ('Expression (log10 RPK)' )
97+ ax1 .set_ylabel ('Duplication Rate' )
98+ ax1 .set_title ('dupRadar: Expression vs Duplication Rate' )
99+
100+ # Box plot (simplified)
101+ ax2 .boxplot ([dup_matrix_filtered ['duplication_rate' ]])
102+ ax2 .set_ylabel ('Duplication Rate' )
103+ ax2 .set_title ('Duplication Rate Distribution' )
104+
105+ # Expression histogram
106+ ax3 .hist (log_rpkm , bins = 50 , alpha = 0.7 )
107+ ax3 .set_xlabel ('Expression (log10 RPK)' )
108+ ax3 .set_ylabel ('Frequency' )
109+ ax3 .set_title ('Expression Distribution' )
110+
111+ # Duplication rate histogram
112+ ax4 .hist (dup_matrix_filtered ['duplication_rate' ], bins = 50 , alpha = 0.7 )
113+ ax4 .set_xlabel ('Duplication Rate' )
114+ ax4 .set_ylabel ('Frequency' )
115+ ax4 .set_title ('Duplication Rate Distribution' )
116+
117+ plt .tight_layout ()
118+ plt .savefig (f"{ args .prefix } _duprateExpDens.pdf" )
119+ plt .close ()
120+
121+ # Individual plots for compatibility
122+ plt .figure (figsize = (8 , 6 ))
123+ plt .boxplot ([dup_matrix_filtered ['duplication_rate' ]])
124+ plt .ylabel ('Duplication Rate' )
125+ plt .title ('Duplication Rate Distribution' )
126+ plt .savefig (f"{ args .prefix } _duprateExpBoxplot.pdf" )
127+ plt .close ()
128+
129+ plt .figure (figsize = (8 , 6 ))
130+ plt .hist (log_rpkm , bins = 50 , alpha = 0.7 )
131+ plt .xlabel ('Expression (log10 RPK)' )
132+ plt .ylabel ('Frequency' )
133+ plt .title ('Expression Distribution' )
134+ plt .savefig (f"{ args .prefix } _expressionHist.pdf" )
135+ plt .close ()
136+
137+ else :
138+ # Handle case with no/insufficient data
139+ with open (f"{ args .prefix } _intercept_slope.txt" , 'w' ) as f :
140+ f .write ("intercept\t NA\n " )
141+ f .write ("slope\t NA\n " )
142+ f .write ("r_squared\t NA\n " )
143+
144+ # Generate empty MultiQC files
145+ with open (f"{ args .prefix } _dup_intercept_mqc.txt" , 'w' ) as f :
146+ f .write ("# plot_type: 'generalstats'\n " )
147+ f .write ("# pconfig:\n " )
148+ f .write ("# dupradar_intercept:\n " )
149+ f .write ("# title: 'dupRadar Intercept'\n " )
150+ f .write ("# description: 'Intercept of the dupRadar fitted curve'\n " )
151+ f .write ("# format: '{:.3f}'\n " )
152+ f .write ("Sample\t dupradar_intercept\n " )
153+ f .write (f"{ args .prefix } \t NA\n " )
154+
155+ with open (f"{ args .prefix } _duprateExpDensCurve_mqc.txt" , 'w' ) as f :
156+ f .write ("# plot_type: 'linegraph'\n " )
157+ f .write ("# section_name: 'dupRadar'\n " )
158+ f .write ("# description: 'Expression vs Duplication Rate'\n " )
159+ f .write ("# pconfig:\n " )
160+ f .write ("# title: 'dupRadar: Expression vs Duplication Rate'\n " )
161+ f .write ("# xlab: 'Expression (log10 RPK)'\n " )
162+ f .write ("# ylab: 'Duplication Rate'\n " )
163+ f .write ("log10_rpk\t duplication_rate\n " )
164+ f .write ("0\t 0\n " )
165+ f .write ("1\t 0\n " )
166+
167+ # Create empty plots
168+ fig , ax = plt .subplots (figsize = (8 , 6 ))
169+ ax .text (0.5 , 0.5 , 'Insufficient data for analysis' ,
170+ horizontalalignment = 'center' , verticalalignment = 'center' , transform = ax .transAxes )
171+ plt .savefig (f"{ args .prefix } _duprateExpDens.pdf" )
172+ plt .savefig (f"{ args .prefix } _duprateExpBoxplot.pdf" )
173+ plt .savefig (f"{ args .prefix } _expressionHist.pdf" )
174+ plt .close ()
175+
176+ print ("Analysis complete" )
177+
178+ if __name__ == "__main__" :
179+ main ()
0 commit comments