Skip to content

Commit 0821215

Browse files
edmundmillerclaude
andcommitted
refactor: extract Python script from main.nf to bin directory
Moves embedded Python script from dupRadar main.nf to separate bin script for better maintainability and cleaner code organization. The Python analysis logic is now contained in dupradar_fast_analysis.py with proper argument parsing. - Extract 150+ line Python script to bin/dupradar_fast_analysis.py - Replace embedded script with simple bin script call - Maintain identical functionality and output formats - Improve code readability and maintainability 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <[email protected]>
1 parent 57b5f4f commit 0821215

File tree

2 files changed

+184
-176
lines changed

2 files changed

+184
-176
lines changed

bin/dupradar_fast_analysis.py

Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Fast dupRadar analysis script
4+
Replaces R/dupRadar with Python for high-performance processing
5+
Maintains full MultiQC compatibility
6+
"""
7+
8+
import sys
9+
import pandas as pd
10+
import numpy as np
11+
import matplotlib.pyplot as plt
12+
from scipy.stats import linregress
13+
import argparse
14+
15+
def main():
16+
parser = argparse.ArgumentParser(description='Fast dupRadar analysis')
17+
parser.add_argument('--with-dups', required=True, help='featureCounts output with duplicates')
18+
parser.add_argument('--no-dups', required=True, help='featureCounts output without duplicates')
19+
parser.add_argument('--prefix', required=True, help='Output prefix')
20+
args = parser.parse_args()
21+
22+
# Read featureCounts outputs
23+
with_dups = pd.read_csv(args.with_dups, sep='\t', comment='#', skiprows=1)
24+
no_dups = pd.read_csv(args.no_dups, sep='\t', comment='#', skiprows=1)
25+
26+
# Extract count columns (last column is the BAM file)
27+
with_dups_counts = with_dups.iloc[:, -1]
28+
no_dups_counts = no_dups.iloc[:, -1]
29+
30+
# Calculate RPK (Reads per Kilobase)
31+
gene_lengths = with_dups['Length'] / 1000 # Convert to kb
32+
with_dups_rpk = with_dups_counts / gene_lengths
33+
no_dups_rpk = no_dups_counts / gene_lengths
34+
35+
# Calculate duplication rate
36+
duplication_rate = np.where(with_dups_rpk > 0, 1 - (no_dups_rpk / with_dups_rpk), 0)
37+
38+
# Create dupMatrix
39+
dup_matrix = pd.DataFrame({
40+
'gene_id': with_dups['Geneid'],
41+
'rpkm': with_dups_rpk, # Using RPK instead of RPKM for simplicity
42+
'duplication_rate': duplication_rate
43+
})
44+
45+
# Filter out genes with zero expression
46+
dup_matrix_filtered = dup_matrix[dup_matrix['rpkm'] > 0]
47+
48+
# Save dupMatrix
49+
dup_matrix_filtered.to_csv(f"{args.prefix}_dupMatrix.txt", sep='\t', index=False)
50+
51+
# Calculate intercept and slope
52+
if len(dup_matrix_filtered) > 1:
53+
log_rpkm = np.log10(dup_matrix_filtered['rpkm'])
54+
slope, intercept, r_value, p_value, std_err = linregress(log_rpkm, dup_matrix_filtered['duplication_rate'])
55+
56+
# Save intercept and slope
57+
with open(f"{args.prefix}_intercept_slope.txt", 'w') as f:
58+
f.write(f"intercept\t{intercept}\n")
59+
f.write(f"slope\t{slope}\n")
60+
f.write(f"r_squared\t{r_value**2}\n")
61+
62+
# Generate MultiQC files
63+
with open(f"{args.prefix}_dup_intercept_mqc.txt", 'w') as f:
64+
f.write("# plot_type: 'generalstats'\n")
65+
f.write("# pconfig:\n")
66+
f.write("# dupradar_intercept:\n")
67+
f.write("# title: 'dupRadar Intercept'\n")
68+
f.write("# description: 'Intercept of the dupRadar fitted curve'\n")
69+
f.write("# format: '{:.3f}'\n")
70+
f.write("Sample\tdupradar_intercept\n")
71+
f.write(f"{args.prefix}\t{intercept:.3f}\n")
72+
73+
with open(f"{args.prefix}_duprateExpDensCurve_mqc.txt", 'w') as f:
74+
f.write("# plot_type: 'linegraph'\n")
75+
f.write("# section_name: 'dupRadar'\n")
76+
f.write("# description: 'Expression vs Duplication Rate'\n")
77+
f.write("# pconfig:\n")
78+
f.write("# title: 'dupRadar: Expression vs Duplication Rate'\n")
79+
f.write("# xlab: 'Expression (log10 RPK)'\n")
80+
f.write("# ylab: 'Duplication Rate'\n")
81+
82+
# Sample data points for the curve
83+
x_points = np.linspace(log_rpkm.min(), log_rpkm.max(), 100)
84+
y_points = slope * x_points + intercept
85+
86+
f.write("log10_rpk\tduplication_rate\n")
87+
for x, y in zip(x_points, y_points):
88+
f.write(f"{x:.3f}\t{y:.3f}\n")
89+
90+
# Generate basic plots using matplotlib
91+
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(12, 10))
92+
93+
# Scatter plot
94+
ax1.scatter(log_rpkm, dup_matrix_filtered['duplication_rate'], alpha=0.6, s=1)
95+
ax1.plot(x_points, y_points, 'r-', linewidth=2)
96+
ax1.set_xlabel('Expression (log10 RPK)')
97+
ax1.set_ylabel('Duplication Rate')
98+
ax1.set_title('dupRadar: Expression vs Duplication Rate')
99+
100+
# Box plot (simplified)
101+
ax2.boxplot([dup_matrix_filtered['duplication_rate']])
102+
ax2.set_ylabel('Duplication Rate')
103+
ax2.set_title('Duplication Rate Distribution')
104+
105+
# Expression histogram
106+
ax3.hist(log_rpkm, bins=50, alpha=0.7)
107+
ax3.set_xlabel('Expression (log10 RPK)')
108+
ax3.set_ylabel('Frequency')
109+
ax3.set_title('Expression Distribution')
110+
111+
# Duplication rate histogram
112+
ax4.hist(dup_matrix_filtered['duplication_rate'], bins=50, alpha=0.7)
113+
ax4.set_xlabel('Duplication Rate')
114+
ax4.set_ylabel('Frequency')
115+
ax4.set_title('Duplication Rate Distribution')
116+
117+
plt.tight_layout()
118+
plt.savefig(f"{args.prefix}_duprateExpDens.pdf")
119+
plt.close()
120+
121+
# Individual plots for compatibility
122+
plt.figure(figsize=(8, 6))
123+
plt.boxplot([dup_matrix_filtered['duplication_rate']])
124+
plt.ylabel('Duplication Rate')
125+
plt.title('Duplication Rate Distribution')
126+
plt.savefig(f"{args.prefix}_duprateExpBoxplot.pdf")
127+
plt.close()
128+
129+
plt.figure(figsize=(8, 6))
130+
plt.hist(log_rpkm, bins=50, alpha=0.7)
131+
plt.xlabel('Expression (log10 RPK)')
132+
plt.ylabel('Frequency')
133+
plt.title('Expression Distribution')
134+
plt.savefig(f"{args.prefix}_expressionHist.pdf")
135+
plt.close()
136+
137+
else:
138+
# Handle case with no/insufficient data
139+
with open(f"{args.prefix}_intercept_slope.txt", 'w') as f:
140+
f.write("intercept\tNA\n")
141+
f.write("slope\tNA\n")
142+
f.write("r_squared\tNA\n")
143+
144+
# Generate empty MultiQC files
145+
with open(f"{args.prefix}_dup_intercept_mqc.txt", 'w') as f:
146+
f.write("# plot_type: 'generalstats'\n")
147+
f.write("# pconfig:\n")
148+
f.write("# dupradar_intercept:\n")
149+
f.write("# title: 'dupRadar Intercept'\n")
150+
f.write("# description: 'Intercept of the dupRadar fitted curve'\n")
151+
f.write("# format: '{:.3f}'\n")
152+
f.write("Sample\tdupradar_intercept\n")
153+
f.write(f"{args.prefix}\tNA\n")
154+
155+
with open(f"{args.prefix}_duprateExpDensCurve_mqc.txt", 'w') as f:
156+
f.write("# plot_type: 'linegraph'\n")
157+
f.write("# section_name: 'dupRadar'\n")
158+
f.write("# description: 'Expression vs Duplication Rate'\n")
159+
f.write("# pconfig:\n")
160+
f.write("# title: 'dupRadar: Expression vs Duplication Rate'\n")
161+
f.write("# xlab: 'Expression (log10 RPK)'\n")
162+
f.write("# ylab: 'Duplication Rate'\n")
163+
f.write("log10_rpk\tduplication_rate\n")
164+
f.write("0\t0\n")
165+
f.write("1\t0\n")
166+
167+
# Create empty plots
168+
fig, ax = plt.subplots(figsize=(8, 6))
169+
ax.text(0.5, 0.5, 'Insufficient data for analysis',
170+
horizontalalignment='center', verticalalignment='center', transform=ax.transAxes)
171+
plt.savefig(f"{args.prefix}_duprateExpDens.pdf")
172+
plt.savefig(f"{args.prefix}_duprateExpBoxplot.pdf")
173+
plt.savefig(f"{args.prefix}_expressionHist.pdf")
174+
plt.close()
175+
176+
print("Analysis complete")
177+
178+
if __name__ == "__main__":
179+
main()

modules/nf-core/dupradar/main.nf

Lines changed: 5 additions & 176 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)