Skip to content

Commit 0c46e2e

Browse files
feat: add leech as optional GPU-accelerated CCA classifier
Add leech (rnabioco/leech) as a git submodule and integrate it as an alternative to remora for charging classification. Controlled by `classifier: "remora" | "leech"` in config (remora remains the default).

- Add classify_charging_leech rule with GPU support and ruleorder routing
- Add GPU resource configs for leech in both LSF and SLURM profiles
- Add leech installation to setup-tools.sh and pixi install-leech task
- Add leech tool check in Snakefile onstart when classifier is leech
- Fix pod5 missing 'deprecated' dependency from --no-deps install
1 parent b2c6f58 commit 0c46e2e

File tree

9 files changed

+111
-1
lines changed

9 files changed

+111
-1
lines changed

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
[submodule "resources/leech"]
2+
path = resources/leech
3+
url = https://github.com/rnabioco/leech

cluster/lsf/config.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@ set-resources:
1818
- rebasecall:ngpu=1
1919
- rebasecall:mem_mb=24
2020
- classify_charging:mem_mb=24
21+
- classify_charging_leech:lsf_queue="gpu"
22+
- classify_charging_leech:lsf_extra="-gpu num=1:j_exclusive=yes:mode=exclusive_process"
23+
- classify_charging_leech:ngpu=1
24+
- classify_charging_leech:mem_mb=24
2125
- remora_signal_stats:mem_mb=24
2226
- inject_ubam_tags:mem_mb=16
2327
- bwa_align:mem_mb=24

cluster/slurm/config.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,14 @@ set-resources:
4040
mem_mb: 24000 # 24GB
4141
cpus_per_task: 4
4242

43+
# Leech charging classification (GPU-accelerated)
44+
classify_charging_leech:
45+
slurm_partition: "gpu" # GPU partition (customize for your cluster)
46+
gres: "gpu:1" # Request 1 GPU
47+
runtime: 120 # 2 hours (faster than remora with GPU)
48+
mem_mb: 24000 # 24GB
49+
cpus_per_task: 4
50+
4351
# Tag injection from unaligned BAM
4452
inject_ubam_tags:
4553
runtime: 360 # 6 hours

config/config-base.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,10 @@ remora_kmer_table: null
6060
# read classification model - remora trained model to classify charged vs uncharged reads
6161
remora_cca_classifier: "resources/models/cca_classifier.pt"
6262

63+
# CCA charging classifier to use: "remora" (default) or "leech"
64+
# Leech is a GPU-accelerated alternative (requires resources/leech submodule)
65+
classifier: "remora"
66+
6367
# software tools (modkit and remora managed by pixi)
6468
# NOTE: dorado_version and dorado_model are coupled. When upgrading dorado,
6569
# check the release notes for new models:

pixi.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ bottleneck = "*"
4646
scripts = ["scripts/setup-env.sh"]
4747

4848
[tasks]
49-
setup = { cmd = "bash scripts/setup-tools.sh", description = "One-time setup: install dorado, models, remora, and WarpDemuX" }
49+
setup = { cmd = "bash scripts/setup-tools.sh", description = "One-time setup: install dorado, models, remora, leech, and WarpDemuX" }
5050
dl-test-data = { cmd = "cd .tests && bash dl_test_data.sh", description = "Download test data" }
5151
# Pipeline execution
5252
dry-run = { cmd = "snakemake -n --configfile=config/config-test.yml", description = "Dry run with test config" }
@@ -62,6 +62,8 @@ collapse-ref = { cmd = "python workflow/scripts/collapse_gtrndb_fasta.py", descr
6262
lint-snakefmt = { cmd = "snakefmt --check workflow/", description = "Check Snakemake formatting" }
6363
lint-python = { cmd = "ruff check workflow/scripts/ && ruff format --check workflow/scripts/", description = "Lint and check Python formatting" }
6464
lint-yaml = { cmd = "yamllint -d \"{extends: default, rules: {line-length: {max: 120}, document-start: disable}}\" config/ .github/", description = "Lint YAML files" }
65+
# Leech tasks
66+
# Submodule checkout and editable install must succeed; only the pyarrow re-pin is skipped when pixi reports no pyarrow version.
install-leech = { cmd = "git submodule update --init resources/leech && uv pip install -e resources/leech && CONDA_PYARROW=$(pixi list pyarrow 2>/dev/null | awk '/^pyarrow[[:space:]]/ {print $2}') && ([ -z \"$CONDA_PYARROW\" ] || uv pip install --force-reinstall pyarrow==$CONDA_PYARROW)", description = "Install leech CCA classifier from submodule" }
6567
# WarpDemuX tasks
6668
install-warpdemux = { cmd = "bash -c '[ -d resources/tools/WarpDemuX ] || git clone --recursive https://github.com/KleistLab/WarpDemuX.git resources/tools/WarpDemuX' && uv pip install -e resources/tools/WarpDemuX", description = "Clone and install WarpDemuX" }
6769
dry-run-demux = { cmd = "snakemake -n --configfile=config/config-demux-test.yml", description = "Dry run demux test" }

resources/leech

Submodule leech added at 70a0a5f

scripts/setup-tools.sh

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,8 @@ if python -c "from packaging.version import Version; exit(0 if Version('${curren
156156
else
157157
echo "Installing pod5 >= ${POD5_MIN_VERSION}..."
158158
uv pip install --no-deps "pod5>=${POD5_MIN_VERSION}"
159+
# pod5 needs 'deprecated' but --no-deps skips it
160+
uv pip install deprecated
159161
echo "Pod5 installed successfully"
160162
fi
161163

@@ -173,6 +175,24 @@ else
173175
echo "WarpDemuX installed successfully"
174176
fi
175177

178+
# ============================================================================
179+
# Leech Setup (via uv, from submodule)
180+
# ============================================================================
181+
echo "=== Checking leech ==="
182+
if python -c "import leech" 2>/dev/null; then
183+
echo "Leech already installed"
184+
else
185+
if [ -f "${REPO_ROOT}/resources/leech/pyproject.toml" ] || [ -f "${REPO_ROOT}/resources/leech/setup.py" ]; then  # an uninitialized submodule is just an empty directory, so test for a project file instead of the directory
186+
echo "Installing leech from submodule (with dependencies)..."
187+
uv pip install -e "${REPO_ROOT}/resources/leech"
188+
echo "Leech installed successfully"
189+
echo "NOTE: pyarrow will be reconciled with conda in the next step"
190+
else
191+
echo "Leech submodule not found or not initialized at resources/leech"
192+
echo "Run 'git submodule update --init resources/leech' to clone it"
193+
fi
194+
fi
195+
176196
# ============================================================================
177197
# Reconcile pyarrow: ensure pip hasn't overridden conda's version
178198
# ============================================================================

workflow/Snakefile

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,9 @@ onstart:
5757
if config.get("warpdemux", {}).get("enabled", False):
5858
tools_to_check.append(("warpdemux", [conda_prefix]))
5959

60+
if config.get("classifier", "remora") == "leech":
61+
tools_to_check.append(("leech", [conda_prefix, os.path.join(repo_root, "resources")]))
62+
6063
for tool_name, allowed_prefixes in tools_to_check:
6164
tool_path = shutil.which(tool_name)
6265
if tool_path is None:
@@ -103,6 +106,16 @@ localrules:
103106
generate_squiggy_session,
104107

105108

109+
# Route charging classifier: prefer leech or remora based on config
110+
if config.get("classifier", "remora") == "leech":
111+
112+
ruleorder: classify_charging_leech > classify_charging
113+
114+
else:
115+
116+
ruleorder: classify_charging > classify_charging_leech
117+
118+
106119
# Conditionally include WarpDemuX demultiplexing rules
107120
if config.get("warpdemux", {}).get("enabled", False):
108121

workflow/rules/aatrnaseq-process.smk

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,61 @@ rule classify_charging:
214214
"""
215215

216216

217+
rule classify_charging_leech:
218+
"""
219+
run leech on the tagged BAM and its POD5 signal to classify charged and uncharged reads, then sort and index the result
220+
GPU-accelerated alternative to remora (requires leech installed from resources/leech); the model path is read from config['remora_cca_classifier']
221+
222+
For EDX samples, uses the EDX-filtered POD5 to match the filtered BAM.
223+
"""
224+
input:
225+
pod5=get_classification_pod5,
226+
bam=rules.inject_ubam_tags.output.bam,
227+
output:
228+
charging_bam=os.path.join(
229+
outdir, "bam", "charging", "{sample}", "{sample}.charging.bam"
230+
),
231+
charging_bam_bai=os.path.join(
232+
outdir, "bam", "charging", "{sample}", "{sample}.charging.bam.bai"
233+
),
234+
temp_sorted_bam=temp(
235+
os.path.join(
236+
outdir, "bam", "charging", "{sample}", "{sample}.charging.bam.tmp"
237+
)
238+
),
239+
log:
240+
os.path.join(outdir, "logs", "classify_charging_leech", "{sample}"),
241+
threads: 4
242+
params:
243+
model=config["remora_cca_classifier"],
244+
shell:
245+
"""
246+
if [[ "${{CUDA_VISIBLE_DEVICES:-}}" ]]; then
247+
echo "CUDA_VISIBLE_DEVICES $CUDA_VISIBLE_DEVICES"
248+
export CUDA_VISIBLE_DEVICES
249+
fi
250+
251+
leech predict \
252+
--model {params.model} \
253+
--pod5 {input.pod5} \
254+
--bam {input.bam} \
255+
--output {output.charging_bam} \
256+
--device cuda \
257+
--motif CCAGGC \
258+
--motif-offset 3 \
259+
--reference-anchored \
260+
--workers {threads} \
261+
--batch-size 512 \
262+
2>&1 | tee {log}
263+
264+
# sort the result; cp (not mv) leaves the declared temp() output in place for Snakemake to remove
265+
samtools sort -@ {threads} {output.charging_bam} > {output.temp_sorted_bam}
266+
cp {output.temp_sorted_bam} {output.charging_bam}
267+
268+
samtools index {output.charging_bam}
269+
"""
270+
271+
217272
rule transfer_bam_tags:
218273
"""
219274
creates classified bam with MM and ML tags transferred to CM/CL

0 commit comments

Comments
 (0)