Skip to content

Commit ac932eb

Browse files
committed
baysor preview multiqc integration
1 parent df5c503 commit ac932eb

File tree

5 files changed

+210
-68
lines changed

5 files changed

+210
-68
lines changed

modules/local/utility/clean_html/templates/clean_html.py

Lines changed: 0 additions & 55 deletions
This file was deleted.
Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
1-
process CLEAN_PREVIEW_HTML {
1+
process EXTRACT_PREVIEW_DATA {
22
tag "${meta.id}"
33
label 'process_low'
44

5-
container "community.wave.seqera.io/library/beautifulsoup4_procs:3f09125465990b35"
5+
container "community.wave.seqera.io/library/beautifulsoup4_pandas:d3b8b3eb86514c3c"
66

77
input:
88
tuple val(meta), path(preview_html)
99

1010
output:
11-
tuple val(meta), path("${prefix}/preview_mqc.html"), emit: mqc_html
11+
tuple val(meta), path("${prefix}/*_mqc.tsv"), emit: mqc_data
12+
tuple val(meta), path("${prefix}/*_mqc.png"), emit: mqc_img
1213
path ("versions.yml"), emit: versions
1314

1415
when:
@@ -17,27 +18,31 @@ process CLEAN_PREVIEW_HTML {
1718
script:
1819
// Exit if running this module with -profile conda / -profile mamba
1920
if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
20-
error("CLEAN_HTML module does not support Conda. Please use Docker / Singularity / Podman instead.")
21+
error("EXTRACT_PREVIEW_DATA module does not support Conda. Please use Docker / Singularity / Podman instead.")
2122
}
2223

2324
prefix = task.ext.prefix ?: "${meta.id}"
2425

25-
template('clean_html.py')
26+
template('extract_data.py')
2627

2728
stub:
2829
// Exit if running this module with -profile conda / -profile mamba
2930
if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
30-
error("CLEAN_HTML module does not support Conda. Please use Docker / Singularity / Podman instead.")
31+
error("EXTRACT_PREVIEW_DATA module does not support Conda. Please use Docker / Singularity / Podman instead.")
3132
}
3233
prefix = task.ext.prefix ?: "${meta.id}"
3334

3435
"""
3536
mkdir -p ${prefix}
36-
touch ${prefix}/preview_mqc.html
37+
touch ${prefix}/noise_distribution_mqc.tsv
38+
touch ${prefix}/gene_structure_mqc.tsv
39+
touch ${prefix}/umap_mqc.tsv
40+
touch ${prefix}/transcript_plots_mqc.png
41+
touch ${prefix}/noise_level_mqc.png
3742
3843
cat <<-END_VERSIONS > versions.yml
3944
"${task.process}":
40-
CLEAN_HTML: "1.0.0"
45+
EXTRACT_PREVIEW_DATA: "1.0.0"
4146
END_VERSIONS
4247
"""
4348
}
Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
#!/usr/bin/env python3
2+
3+
4+
import re
5+
import sys
6+
import json
7+
import html
8+
import base64
9+
from pathlib import Path
10+
from typing import Dict, List, Optional, Tuple
11+
import pandas as pd
12+
from bs4 import BeautifulSoup
13+
14+
15+
def get_png_files(soup: BeautifulSoup, outdir: Path) -> None:
    """Extract base64-embedded PNG images that follow specific <h1> tags.

    For each target heading id, the first <img> that follows the heading in
    document order is decoded from its ``data:image/png;base64,`` src and
    written into ``outdir``. Missing headings/images only warn; they do not
    abort, so PNG extraction stays best-effort.

    Args:
        soup: Parsed preview HTML document.
        outdir: Directory the PNG files are written into (created if missing).
    """
    target_ids = ["Transcript_Plots", "Noise_Level"]
    outdir.mkdir(parents=True, exist_ok=True)

    for h1_id in target_ids:
        h1_tag = soup.find("h1", id=h1_id)
        if not h1_tag:
            print(f"[WARN] No <h1> with id {h1_id} found")
            continue

        # Look for the first <img> after the h1 in the DOM
        img_tag = h1_tag.find_next("img")
        if not img_tag or not img_tag.get("src"):
            print(f"[WARN] No <img> found after h1#{h1_id}")
            continue

        img_src = img_tag["src"]
        if img_src.startswith("data:image/png;base64,"):
            base64_data = img_src.split(",", 1)[1]
            data = base64.b64decode(base64_data)
        else:
            print(f"[WARN] img src is not base64 PNG for h1#{h1_id}")
            continue

        # Bug fix: the file name must carry the "_mqc" suffix so it matches
        # the process output glob "*_mqc.png" and the stub file names
        # (transcript_plots_mqc.png, noise_level_mqc.png); the previous
        # "<id>.png" name was never picked up by the output channel.
        img_name = f"{h1_id.lower()}_mqc.png"
        out_path = outdir / img_name
        with open(out_path, "wb") as f:
            f.write(data)

        print(f"[INFO] Saved {img_name}")

    return None
49+
50+
51+
def extract_js_object(text: str, start_idx: int) -> Tuple[Optional[str], int]:
    """Extract a balanced ``{...}`` JS object literal starting at start_idx.

    Tracks brace depth while being aware of string literals (with escapes)
    and JS comments, so braces inside strings or comments do not affect
    the balance.

    Args:
        text: Full script text to scan.
        start_idx: Index of the opening ``{``.

    Returns:
        ``(object_text, next_index)`` on success, or ``(None, start_idx)``
        when start_idx is not a ``{`` or the object is unbalanced.
    """
    if start_idx >= len(text) or text[start_idx] != "{":
        return None, start_idx

    # Bug fix: the original used "for i in range(...)" and reassigned i to
    # skip comments — reassigning a for-loop variable has no effect on the
    # next iteration in Python, so comment contents were still scanned and
    # braces inside comments corrupted the depth count. A while loop lets
    # us actually jump past comments.
    depth = 0
    in_str = False
    escape = False
    quote = None
    n = len(text)
    i = start_idx
    while i < n:
        ch = text[i]
        if in_str:
            if escape:
                escape = False
            elif ch == "\\":
                escape = True
            elif ch == quote:
                in_str = False
        elif ch in ('"', "'"):
            in_str, quote = True, ch
        elif ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return text[start_idx : i + 1], i + 1
        elif ch == "/" and i + 1 < n:
            # skip js comments
            nxt = text[i + 1]
            if nxt == "/":
                end = text.find("\n", i + 2)
                if end == -1:
                    break  # unterminated line comment: object is unbalanced
                i = end  # land on the newline; advanced past it below
            elif nxt == "*":
                end = text.find("*/", i + 2)
                if end == -1:
                    break  # unterminated block comment
                i = end + 1  # land on the closing '/'; advanced below
        i += 1

    return None, start_idx
88+
89+
90+
def js_to_json(js: str) -> str:
    """Convert a JS object literal string to (hopefully) valid JSON.

    Best-effort textual transform, not a full JS parser: strips JS comments,
    rewrites single-quoted strings as double-quoted JSON strings, and removes
    trailing/duplicate commas.

    Args:
        js: Raw JS object text (as returned by extract_js_object).

    Returns:
        A cleaned string intended for json.loads.
    """
    # Remove comments first so their contents cannot interfere with the
    # string and comma rewrites below.
    js = re.sub(r"/\*.*?\*/", "", js, flags=re.S)
    js = re.sub(r"//[^\n]*", "", js)

    def _requote(match):
        # Bug fix: \' is a valid escape inside a JS single-quoted string but
        # an *invalid* JSON escape — json.loads rejects it. Unescape it, then
        # escape any bare double quotes for the new delimiters.
        body = match.group(1)
        body = body.replace("\\'", "'")
        body = body.replace('"', '\\"')
        return '"' + body + '"'

    # Convert single-quoted strings to double-quoted strings
    js = re.sub(r"'((?:\\.|[^'\\])*)'", _requote, js)

    # Remove trailing commas (forbidden in JSON) and collapse duplicates
    js = re.sub(r",\s*(?=[}\]])", "", js)
    js = re.sub(r",\s*,+", ",", js)

    return js.strip()
108+
109+
110+
def find_variables(script_text: str) -> Dict[str, str]:
    """Locate every ``var|let|const specN = {...}`` declaration.

    HTML entities are unescaped first, then each declaration's object
    literal is pulled out with extract_js_object.

    Args:
        script_text: Concatenated <script> contents (or raw JS text).

    Returns:
        Mapping of variable name (e.g. "spec1") to its raw JS object text.
    """
    decl_re = re.compile(r"(?:var|let|const)\s+(spec\d+)\s*=\s*{", re.I)
    unescaped = html.unescape(script_text)

    found: Dict[str, str] = {}
    for hit in decl_re.finditer(unescaped):
        var = hit.group(1)
        literal, _ = extract_js_object(unescaped, hit.end() - 1)
        if not literal:
            print(f"[WARN] Could not extract object for {var}")
        else:
            found[var] = literal
    return found
124+
125+
126+
def write_tsvs(specs: Dict[str, str], outdir: Path) -> List[Path]:
    """Convert extracted json to tsv files for MultiQC custom content.

    Each spec's ``data.values`` array becomes one ``<var>_mqc.tsv`` with a
    MultiQC header block. Failures are logged per spec and skipped so one
    bad spec does not abort the rest (deliberate best-effort behaviour).

    Args:
        specs: Mapping of variable name to raw JS object text.
        outdir: Output directory (created if missing).

    Returns:
        List of paths actually written.
    """
    outdir.mkdir(parents=True, exist_ok=True)
    results: List[Path] = []

    for var, raw_obj in specs.items():
        try:
            parsed = json.loads(js_to_json(raw_obj))
            rows = parsed.get("data", {}).get("values", [])
            if not rows:
                print(f"[WARN] No data.values found in {var}")
                continue

            frame = pd.DataFrame(rows)
            outpath = outdir / f"{var}_mqc.tsv"

            header = (
                "# plot_type: linegraph\n"
                f"# section_name: {var}\n"
                "# description: Extracted preview data\n"
            )
            with open(outpath, "w") as handle:
                handle.write(header)
                frame.to_csv(handle, sep="\t", index=False)

            results.append(outpath)
            print(f"[INFO] Wrote {outpath} ({len(frame)} rows × {len(frame.columns)} cols)")
        except Exception as exc:
            print(f"[ERROR] Failed to process {var}: {exc}")

    return results
154+
155+
156+
157+
# Entry point of the Nextflow template: the placeholders (preview_html,
# prefix, task.process) are substituted by the template engine before the
# script runs, so this block must stay interpolation-safe.
if __name__ == "__main__":

    # Baysor preview HTML report and the per-sample output directory.
    input_path: Path = Path("${preview_html}")
    outdir: Path = Path("${prefix}")

    # Read leniently: encoding errors are ignored rather than fatal.
    text = input_path.read_text(encoding="utf-8", errors="ignore")
    soup = BeautifulSoup(text, "html.parser")

    # get the script section
    # Concatenate all <script> bodies; if no <script> tag exists, assume the
    # input is already raw JS text.
    if "<script" in text.lower():
        script_text = "\n".join(s.get_text() for s in soup.find_all("script"))
    else:
        script_text = text

    # Locate the embedded "specN = {...}" chart definitions; no specs at all
    # is treated as a hard failure for the process.
    spec_variables = find_variables(script_text)
    if not spec_variables:
        print("[ERROR] No variables (spec1, spec2, spec3) found.")
        sys.exit(1)

    # write tsv files for multiqc
    written = write_tsvs(spec_variables, outdir)
    if not written:
        print("[ERROR] No TSVs written.")
        sys.exit(1)

    # get png files
    # PNG extraction is best-effort: it warns on failure but never aborts.
    get_png_files(soup=soup, outdir=outdir)

    # write versions.yml
    # NOTE(review): the doubled backslash is presumably collapsed by the
    # template engine so the generated script writes real newlines — confirm
    # the rendered versions.yml is valid YAML, not literal "\n" text.
    with open("versions.yml", "w") as f:
        f.write('"${task.process}":\\n')
        f.write('EXTRACT_PREVIEW_DATA: "1.0.0"\\n')

subworkflows/local/baysor_generate_preview/main.nf

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
include { BAYSOR_PREVIEW } from '../../../modules/local/baysor/preview/main'
66
include { BAYSOR_CREATE_DATASET } from '../../../modules/local/baysor/create_dataset/main'
7-
include { CLEAN_PREVIEW_HTML } from '../../../modules/local/utility/clean_html/main'
7+
include { EXTRACT_PREVIEW_DATA } from '../../../modules/local/utility/extract_preview_data/main'
88
include { PARQUET_TO_CSV } from '../../../modules/local/utility/spatialconverter/parquet_to_csv/main'
99

1010
workflow BAYSOR_GENERATE_PREVIEW {
@@ -16,6 +16,7 @@ workflow BAYSOR_GENERATE_PREVIEW {
1616

1717
ch_versions = Channel.empty()
1818
ch_preview_mqc_html = Channel.empty()
19+
ch_preview_mqc_png = Channel.empty()
1920

2021

2122
// run parquet to csv
@@ -41,12 +42,14 @@ workflow BAYSOR_GENERATE_PREVIEW {
4142
ch_versions = ch_versions.mix(BAYSOR_PREVIEW.out.versions)
4243

4344
// extract multiqc data from the preview html file generated
44-
CLEAN_PREVIEW_HTML(BAYSOR_PREVIEW.out.preview_html)
45-
ch_versions = ch_versions.mix(CLEAN_PREVIEW_HTML.out.versions)
45+
EXTRACT_PREVIEW_DATA(BAYSOR_PREVIEW.out.preview_html)
46+
ch_versions = ch_versions.mix(EXTRACT_PREVIEW_DATA.out.versions)
4647

47-
ch_preview_mqc_html = CLEAN_PREVIEW_HTML.out.mqc_html
48+
ch_preview_mqc_html = EXTRACT_PREVIEW_DATA.out.mqc_data
49+
ch_preview_mqc_png = EXTRACT_PREVIEW_DATA.out.mqc_img
4850

4951
emit:
50-
preview_html = ch_preview_mqc_html // channel: [ val(meta), ["preview_mqc.html"] ]
52+
preview_html = ch_preview_mqc_html // channel: [ val(meta), ["*_mqc.tsv"] ]
53+
preview_img = ch_preview_mqc_png // channel: [ val(meta), ["*_mqc.png"] ]
5154
versions = ch_versions // channel: [ versions.yml ]
5255
}

subworkflows/local/baysor_generate_preview/meta.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ components:
1313
- baysor/preview
1414
- baysor/create/dataset
1515
- parquet/to/csv
16+
- extract/preview/data
1617
input:
1718
- ch_transcripts_parquet:
1819
description: |

0 commit comments

Comments
 (0)