baysor preview multiqc integration

khersameesh24 · khersameesh24 · commit ac932eb2f0bf · 2025-10-30T17:54:31.000+01:00
diff --git a/modules/local/utility/clean_html/templates/clean_html.py b/modules/local/utility/clean_html/templates/clean_html.py
diff --git a/modules/local/utility/extract_preview_data/main.nf b/modules/local/utility/extract_preview_data/main.nf
@@ -1,14 +1,15 @@
-process CLEAN_PREVIEW_HTML {
+process EXTRACT_PREVIEW_DATA {
     tag "${meta.id}"
     label 'process_low'
 
-    container "community.wave.seqera.io/library/beautifulsoup4_procs:3f09125465990b35"
+    container "community.wave.seqera.io/library/beautifulsoup4_pandas:d3b8b3eb86514c3c"
 
     input:
     tuple val(meta), path(preview_html)
 
     output:
-    tuple val(meta), path("${prefix}/preview_mqc.html"), emit: mqc_html
+    tuple val(meta), path("${prefix}/*_mqc.tsv"), emit: mqc_data
+    tuple val(meta), path("${prefix}/*_mqc.png"), emit: mqc_img
     path ("versions.yml"), emit: versions
 
     when:
@@ -17,27 +18,31 @@ process CLEAN_PREVIEW_HTML {
     script:
     // Exit if running this module with -profile conda / -profile mamba
     if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
-        error("CLEAN_HTML module does not support Conda. Please use Docker / Singularity / Podman instead.")
+        error("EXTRACT_PREVIEW_DATA module does not support Conda. Please use Docker / Singularity / Podman instead.")
     }
 
     prefix = task.ext.prefix ?: "${meta.id}"
 
-    template('clean_html.py')
+    template('extract_data.py')
 
     stub:
     // Exit if running this module with -profile conda / -profile mamba
     if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
-        error("CLEAN_HTML module does not support Conda. Please use Docker / Singularity / Podman instead.")
+        error("EXTRACT_PREVIEW_DATA module does not support Conda. Please use Docker / Singularity / Podman instead.")
     }
     prefix = task.ext.prefix ?: "${meta.id}"
 
     """
     mkdir -p ${prefix}
-    touch ${prefix}/preview_mqc.html
+    touch ${prefix}/noise_distribution_mqc.tsv
+    touch ${prefix}/gene_structure_mqc.tsv
+    touch ${prefix}/umap_mqc.tsv
+    touch ${prefix}/transcript_plots_mqc.png
+    touch ${prefix}/noise_level_mqc.png
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
-        CLEAN_HTML: "1.0.0"
+        EXTRACT_PREVIEW_DATA: "1.0.0"
     END_VERSIONS
     """
 }
diff --git a/modules/local/utility/extract_preview_data/templates/extract_data.py b/modules/local/utility/extract_preview_data/templates/extract_data.py
@@ -0,0 +1,188 @@
+#!/usr/bin/env python3
+
+
+import re
+import sys
+import json
+import html
+import base64
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+import pandas as pd
+from bs4 import BeautifulSoup
+
+
+def get_png_files(soup: BeautifulSoup, outdir: Path) -> None:
+    """Get png base64 images following specific h1 tags in preview.html"""
+    target_ids = ["Transcript_Plots", "Noise_Level"]
+    outdir.mkdir(parents=True, exist_ok=True)
+
+    for h1_id in target_ids:
+        h1_tag = soup.find("h1", id=h1_id)
+        if not h1_tag:
+            print(f"[WARN] No <h1> with id {h1_id} found")
+            continue
+
+        # Look for the first <img> after the h1 in the DOM
+        img_tag = h1_tag.find_next("img")
+        if not img_tag or not img_tag.get("src"):
+            print(f"[WARN] No <img> found after h1#{h1_id}")
+            continue
+
+        img_src = img_tag["src"]
+        if img_src.startswith("data:image/png;base64,"):
+            base64_data = img_src.split(",", 1)[1]
+            data = base64.b64decode(base64_data)
+        else:
+            print(f"[WARN] img src is not base64 PNG for h1#{h1_id}")
+            continue
+
+        # save png files
+        img_name = f"{h1_id}.png".lower()
+        out_path = outdir / img_name
+        with open(out_path, "wb") as f:
+            f.write(data)
+
+        print(f"[INFO] Saved {img_name}")
+
+    return None
+
+
+def extract_js_object(text: str, start_idx: int) -> Tuple[Optional[str], int]:
+    """Extract json-like object starting at start_idx."""
+    if start_idx >= len(text) or text[start_idx] != "{":
+        return None, start_idx
+
+    stack, in_str, escape, quote = [], False, False, None
+    for i in range(start_idx, len(text)):
+        ch = text[i]
+        if in_str:
+            if escape:
+                escape = False
+            elif ch == "\\":
+                escape = True
+            elif ch == quote:
+                in_str = False
+        else:
+            if ch in ('"', "'"):
+                in_str, quote = True, ch
+            elif ch == "{":
+                stack.append("{")
+            elif ch == "}":
+                stack.pop()
+                if not stack:
+                    return text[start_idx : i + 1], i + 1
+            elif ch == "/" and i + 1 < len(text):
+                # skip js comments
+                nxt = text[i + 1]
+                if nxt == "/":
+                    end = text.find("\n", i + 2)
+                    i = len(text) - 1 if end == -1 else end
+                elif nxt == "*":
+                    end = text.find("*/", i + 2)
+                    if end == -1:
+                        break
+                    i = end + 1
+
+    return None, start_idx
+
+
+def js_to_json(js: str) -> str:
+    """Convert a JS object string to valid JSON."""
+    # Remove comments
+    js = re.sub(r"/\*.*?\*/", "", js, flags=re.S)
+    js = re.sub(r"//[^\n]*", "", js)
+
+    # Convert single-quoted strings to double-quoted strings
+    js = re.sub(
+        r"'((?:\\.|[^'\\])*)'",
+        lambda m: '"' + m.group(1).replace('\"', '\\"') + '\"',
+        js
+    )
+
+    # Remove trailing commas
+    js = re.sub(r",\s*(?=[}\]])", "", js)
+    js = re.sub(r",\s*,+", ",", js)
+
+    return js.strip()
+
+
+def find_variables(script_text: str) -> Dict[str, str]:
+    """Find all 'var|let|const specN =' declarations and extract their objects."""
+    specs: Dict[str, str] = {}
+    script_text = html.unescape(script_text)
+    pattern = re.compile(r"(?:var|let|const)\s+(spec\d+)\s*=\s*{", re.I)
+
+    for match in pattern.finditer(script_text):
+        var = match.group(1)
+        obj, _ = extract_js_object(script_text, match.end() - 1)
+        if obj:
+            specs[var] = obj
+        else:
+            print(f"[WARN] Could not extract object for {var}")
+    return specs
+
+
+def write_tsvs(specs: Dict[str, str], outdir: Path) -> List[Path]:
+    """Convert extracted json to tsv."""
+    outdir.mkdir(parents=True, exist_ok=True)
+    written: List[Path] = []
+
+    for var, js_obj in specs.items():
+        try:
+            data = json.loads(js_to_json(js_obj))
+            values = data.get("data", {}).get("values", [])
+            if not values:
+                print(f"[WARN] No data.values found in {var}")
+                continue
+
+            df = pd.DataFrame(values)
+            outpath = outdir / f"{var}_mqc.tsv"
+
+            with open(outpath, "w") as f:
+                f.write("# plot_type: linegraph\n")
+                f.write(f"# section_name: {var}\n")
+                f.write("# description: Extracted preview data\n")
+                df.to_csv(f, sep="\t", index=False)
+
+            written.append(outpath)
+            print(f"[INFO] Wrote {outpath} ({len(df)} rows × {len(df.columns)} cols)")
+        except Exception as e:
+            print(f"[ERROR] Failed to process {var}: {e}")
+
+    return written
+
+
+
+if __name__ == "__main__":
+
+    input_path: Path = Path("${preview_html}")
+    outdir: Path = Path("${prefix}")
+
+    text = input_path.read_text(encoding="utf-8", errors="ignore")
+    soup = BeautifulSoup(text, "html.parser")
+
+    # get the script section
+    if "<script" in text.lower():
+        script_text = "\n".join(s.get_text() for s in soup.find_all("script"))
+    else:
+        script_text = text
+
+    spec_variables = find_variables(script_text)
+    if not spec_variables:
+        print("[ERROR] No variables (spec1, spec2, spec3) found.")
+        sys.exit(1)
+
+    # write tsv files for multiqc
+    written = write_tsvs(spec_variables, outdir)
+    if not written:
+        print("[ERROR] No TSVs written.")
+        sys.exit(1)
+
+    # get png files
+    get_png_files(soup=soup, outdir=outdir)
+
+    # write versions.yml
+    with open("versions.yml", "w") as f:
+        f.write('"${task.process}":\\n')
+        f.write('EXTRACT_PREVIEW_DATA: "1.0.0"\\n')
diff --git a/subworkflows/local/baysor_generate_preview/main.nf b/subworkflows/local/baysor_generate_preview/main.nf
@@ -4,7 +4,7 @@
 
 include { BAYSOR_PREVIEW        } from '../../../modules/local/baysor/preview/main'
 include { BAYSOR_CREATE_DATASET } from '../../../modules/local/baysor/create_dataset/main'
-include { CLEAN_PREVIEW_HTML    } from '../../../modules/local/utility/clean_html/main'
+include { EXTRACT_PREVIEW_DATA  } from '../../../modules/local/utility/extract_preview_data/main'
 include { PARQUET_TO_CSV        } from '../../../modules/local/utility/spatialconverter/parquet_to_csv/main'
 
 workflow BAYSOR_GENERATE_PREVIEW {
@@ -16,6 +16,7 @@ workflow BAYSOR_GENERATE_PREVIEW {
 
     ch_versions         = Channel.empty()
     ch_preview_mqc_html = Channel.empty()
+    ch_preview_mqc_png  = Channel.empty()
 
 
     // run parquet to csv
@@ -41,12 +42,14 @@ workflow BAYSOR_GENERATE_PREVIEW {
     ch_versions = ch_versions.mix(BAYSOR_PREVIEW.out.versions)
 
     // clean the preview html file generated
-    CLEAN_PREVIEW_HTML(BAYSOR_PREVIEW.out.preview_html)
-    ch_versions = ch_versions.mix(CLEAN_PREVIEW_HTML.out.versions)
+    EXTRACT_PREVIEW_DATA(BAYSOR_PREVIEW.out.preview_html)
+    ch_versions = ch_versions.mix(EXTRACT_PREVIEW_DATA.out.versions)
 
-    ch_preview_mqc_html = CLEAN_PREVIEW_HTML.out.mqc_html
+    ch_preview_mqc_html = EXTRACT_PREVIEW_DATA.out.mqc_data
+    ch_preview_mqc_png  = EXTRACT_PREVIEW_DATA.out.mqc_img
 
     emit:
-    preview_html = ch_preview_mqc_html // channel: [ val(meta), ["preview_mqc.html"] ]
+    preview_html = ch_preview_mqc_html // channel: [ val(meta), ["*_mqc.tsv"] ]
+    preview_img  = ch_preview_mqc_png  // channel: [ val(meta), ["*_mqc.png"] ]
     versions     = ch_versions         // channel: [ versions.yml ]
 }
diff --git a/subworkflows/local/baysor_generate_preview/meta.yml b/subworkflows/local/baysor_generate_preview/meta.yml
@@ -13,6 +13,7 @@ components:
   - baysor/preview
   - baysor/create/dataset
   - parquet/to/csv
+  - clean/preview/html
 input:
   - ch_transcripts_parquet:
       description: |