Skip to content

Commit 22bad56

Browse files
feat: add reuse_outputs_from config to skip upstream re-computation
When set, creates parse-time symlinks for reference-independent outputs (pod5/, demux/, bam/rebasecall/, fq/) so only alignment and downstream rules run. Enables running the pipeline with different references on identical raw data without re-doing basecalling/demux. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 43ef7cd commit 22bad56

File tree

2 files changed

+37
-0
lines changed

2 files changed

+37
-0
lines changed

config/config-base.yml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,11 @@ reference:
4646
mode: "validate"
4747
# For build mode only: path to raw tRNA FASTA (without adapters, must end in CCA)
4848
raw_fasta: null
49+
# Optional spacer inserted between CCA and 3' adapter in the reference.
50+
# Reads will align with a deletion at this position, absorbing signal
51+
# perturbation from the aminoacyl group on charged tRNAs.
52+
# Default "" = no spacer (backward compatible). Set to e.g. "T" to enable.
53+
cca_spacer: ""
4954

5055
# Optional: provide a kmer table to extract signal metrics using remora
5156
# Set to a path (e.g. "resources/kmers/9mer_levels_v1.txt") to enable
@@ -123,6 +128,12 @@ report:
123128
# path to a .qmd file to include at the end of the QC report (optional)
124129
custom_include: ""
125130

131+
# Reuse reference-independent outputs from a previous pipeline run.
132+
# Set to the output_directory of a completed run to skip POD5 merging,
133+
# basecalling, FASTQ extraction, and demultiplexing.
134+
# Both runs must use the same samples/raw data — only the reference can differ.
135+
reuse_outputs_from: null
136+
126137
warpdemux:
127138
enabled: false
128139
barcode_kit: "WDX4_tRNA_rna004_v1_0"

workflow/rules/common.smk

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,32 @@ def find_raw_inputs(sample_dict):
207207
# set up global samples dictionary to be used throughout pipeline
208208
outdir = config["output_directory"]
209209

210+
# Optional: reuse reference-independent outputs from a previous pipeline run.
# When config["reuse_outputs_from"] is set, each directory listed in
# _REUSE_DIRS that exists under that run's output directory is symlinked
# into this run's output directory while the workflow is being parsed.
_reuse_from = config.get("reuse_outputs_from")
if _reuse_from:
    _reuse_from = os.path.realpath(_reuse_from)
    # Guard: source must exist
    if not os.path.isdir(_reuse_from):
        sys.exit(f"reuse_outputs_from: directory not found: {_reuse_from}")
    # Guard: must not be same as outdir
    os.makedirs(outdir, exist_ok=True)
    _outdir_real = os.path.realpath(outdir)
    if _reuse_from == _outdir_real:
        sys.exit("reuse_outputs_from cannot be the same as output_directory")

    _REUSE_DIRS = ["pod5", "demux", "bam/rebasecall", "fq"]
    for _subdir in _REUSE_DIRS:
        _src = os.path.join(_reuse_from, _subdir)
        _dst = os.path.join(_outdir_real, _subdir)
        if not os.path.isdir(_src):
            continue  # skip missing (e.g., demux/ when demux disabled)
        _src = os.path.realpath(_src)
        # Nested entries such as bam/rebasecall need their parent directory.
        os.makedirs(os.path.dirname(_dst), exist_ok=True)
        try:
            # Create first and inspect on failure: os.path.exists() follows
            # symlinks and reports a dangling link as absent, which would let
            # os.symlink() fail with an unhandled FileExistsError. Attempting
            # the link first also avoids a check-then-create race if the
            # workflow is parsed by more than one process at a time.
            os.symlink(_src, _dst)
        except FileExistsError:
            if os.path.islink(_dst) and os.path.realpath(_dst) == _src:
                continue  # already linked correctly
            sys.exit(f"reuse_outputs_from: {_dst} already exists. Remove it first.")
        print(f"reuse_outputs_from: {_subdir}/ -> {_src}")
    # Drop the temporaries so they do not linger in the workflow namespace.
    del _reuse_from, _REUSE_DIRS, _outdir_real
235+
210236
samples = parse_samples(config["samples"])
211237
samples = find_raw_inputs(samples)
212238

0 commit comments

Comments
 (0)