WIP: Move the GPDM route into the termdb.dmr.ts route; paint annotation-aware DMRs on the genome browser block instead of returning a PNG

compbiolover · compbiolover · commit 9076b22462f7 · 2026-03-12T16:21:21.000-05:00
diff --git a/client/plots/gb/view/View.ts b/client/plots/gb/view/View.ts
@@ -307,6 +307,7 @@ export class View {
 		arg.chr = this.state.config.geneSearchResult.chr
 		arg.start = this.state.config.geneSearchResult.start
 		arg.stop = this.state.config.geneSearchResult.stop
+		if (this.state.config.hlregions?.length) arg.hlregions = this.state.config.hlregions
 		first_genetrack_tolist(this.opts.genome, arg.tklst)
 
 		const _ = await import('#src/block')
diff --git a/client/plots/volcano/interactions/VolcanoInteractions.ts b/client/plots/volcano/interactions/VolcanoInteractions.ts
@@ -3,9 +3,7 @@ import { downloadTable, GeneSetEditUI, MultiTermWrapperEditUI, newSandboxDiv } f
 import { to_svg } from '#src/client'
 import type { VolcanoDom, VolcanoPlotConfig } from '../VolcanoTypes'
 import { TermTypes } from '#shared/terms.js'
-import { GpdmPlot } from '../../gpdm/GpdmPlot'
 import { dofetch3 } from '#common/dofetch'
-import { select } from 'd3-selection'
 
 export class VolcanoInteractions {
 	app: MassAppApi
@@ -191,55 +189,82 @@ export class VolcanoInteractions {
 		})
 	}
 
-	/** When clicking on a DM data point, launches the GPDM probe-level
-	 * analysis in a new sandbox. Looks up gene coordinates via genelookup,
-	 * then calls termdb/gpdm for the region. */
+	/** When clicking on a DM data point, runs GPDM analysis then opens a sandbox
+	 * (sibling to the volcano in the mass plotDiv) with a genome browser Block
+	 * at the gene locus and annotation-aware DMRs overlaid as highlight regions
+	 * (orange=hyper, blue=hypo). */
 	async launchGpdm(geneName: string, promoterId?: string) {
 		const config = this.app.getState().plots.find((p: VolcanoPlotConfig) => p.id === this.id)
 		if (config.termType !== TermTypes.DNA_METHYLATION) return
 
 		const genome = this.app.vocabApi.vocab.genome
+		const dslabel = this.app.vocabApi.vocab.dslabel
+		const genomeObj = this.app.opts.genome
 
 		// Look up gene coordinates
-		const result = await dofetch3('genelookup', {
+		const geneResult = await dofetch3('genelookup', {
 			body: { deep: 1, input: geneName, genome }
 		})
-		if (result.error || !result.gmlst || result.gmlst.length === 0) {
+		if (geneResult.error || !geneResult.gmlst || geneResult.gmlst.length === 0) {
 			window.alert(`Could not find coordinates for gene "${geneName}"`)
 			return
 		}
 
-		const gm = result.gmlst[0]
-		// Expand region by 2kb on each side to capture flanking probes
+		const gm = geneResult.gmlst[0]
 		const pad = 2000
 		const chr = gm.chr
 		const start = Math.max(0, gm.start - pad)
 		const stop = gm.stop + pad
 
-		// Build sample lists from the config's group data
 		const group1 = config.samplelst.groups[0].values || []
 		const group2 = config.samplelst.groups[1].values || []
 
-		// Open a new sandbox (PP standard pattern)
-		const sandboxParent = this.app.opts.plotDiv || select(this.dom.holder.node()!.parentNode as HTMLElement)
-		const sandbox = newSandboxDiv(sandboxParent)
-		const title = promoterId ? `GPDM: ${geneName} (${promoterId})` : `GPDM: ${geneName}`
-		sandbox.header.text(title)
+		const sandbox = newSandboxDiv(this.dom.holder)
+		sandbox.header.text(promoterId ? `DMR: ${geneName} (${promoterId})` : `DMR: ${geneName}`)
+		const waitDiv = sandbox.body.append('div').style('padding', '10px').text('Running GPDM analysis…')
 
-		new GpdmPlot({
-			holder: sandbox.body as any,
-			genome,
-			dslabel: this.app.vocabApi.vocab.dslabel,
+		const dmrResult = await dofetch3('termdb/dmr', {
+			body: { genome, dslabel, chr, start, stop, group1, group2 }
+		})
+		waitDiv.remove()
+
+		if (dmrResult.error) {
+			sandbox.body.append('div').style('padding', '10px').style('color', 'red').text(dmrResult.error)
+			return
+		}
+
+		// Build hlregions — orange=hyper, blue=hypo, alpha scaled by probability
+		const hlregions = (dmrResult.dmrs ?? []).map((dmr: any) => {
+			const alpha = Math.round(Math.min(255, (0.5 + dmr.probability * 0.5) * 255))
+			const hex = alpha.toString(16).padStart(2, '0')
+			const base = dmr.direction === 'hyper' ? '#e66101' : '#5e81f4'
+			return { chr: dmr.chr, start: dmr.start, stop: dmr.stop, color: base + hex }
+		})
+
+		const { first_genetrack_tolist } = await import('#common/1stGenetk')
+		const tklst: any[] = []
+		first_genetrack_tolist(genomeObj, tklst)
+		const { Block } = await import('#src/block')
+		new Block({
+			holder: sandbox.body,
+			genome: genomeObj,
 			chr,
 			start,
 			stop,
-			geneName,
-			promoterId,
-			group1,
-			group2,
-			group1Name: config.samplelst.groups[0].name,
-			group2Name: config.samplelst.groups[1].name
+			tklst,
+			hlregions,
+			nobox: true,
+			width: 800,
+			hidegenelegend: true
 		})
+
+		if (!dmrResult.dmrs?.length) {
+			sandbox.body
+				.append('div')
+				.style('padding', '6px 0')
+				.style('color', '#888')
+				.text('No significant DMRs detected in this region')
+		}
 	}
 
 	async launchDEGClustering() {
diff --git a/python/src/gpdm_analysis.py b/python/src/gpdm_analysis.py
@@ -53,9 +53,42 @@
 # import triggers basicConfig.
 logging.getLogger("gpdm").setLevel(logging.CRITICAL)
 
+# Set matplotlib to non-interactive Agg backend before gpdm import,
+# since gpdm/core.py imports matplotlib.pyplot at module load time.
+import matplotlib
+matplotlib.use('Agg')
+
 # Import the GPDM analysis class (now safe to import without stderr side-effects)
 from gpdm import RegionalDMAnalysis
 
+# Import Query from the existing PP HDF5 reader (same python/src/ directory)
+from query_beta_values import Query
+
+
+def get_region_positions(h5file, chrom, start, stop):
+    """
+    Read CpG genomic positions for a region from the HDF5 file.
+    Query.process_genomic_queries() returns only the beta matrix, not positions,
+    so this small helper reads just the meta/start array for the region.
+    Uses the same boundary logic as Query to ensure row alignment.
+    """
+    with h5py.File(h5file, 'r') as f:
+        chrom_lengths = json.loads(f['/'].attrs['chrom_lengths'])
+        if chrom not in chrom_lengths:
+            return None
+        chroms = list(chrom_lengths.keys())
+        prefix = [0]
+        for c in chroms:
+            prefix.append(prefix[-1] + chrom_lengths[c])
+        idx = chroms.index(chrom)
+        row_start = prefix[idx]
+        start_pos = f['meta/start'][row_start:row_start + chrom_lengths[chrom]]
+        left = int(np.searchsorted(start_pos, start, 'left'))
+        right = int(np.searchsorted(start_pos, stop, 'right'))
+        if left >= right:
+            return None
+        return start_pos[left:right]
+
 
 def read_region_from_h5(h5file, samples, chrom, start, stop):
     """
@@ -197,14 +230,16 @@ def run_gpdm(params):
     nan_threshold = float(params.get('nan_threshold', 0.5))  # drop probes with > 50% missing
     annotations = params.get('annotations', [])               # regulatory domain annotations
 
-    # Read the HDF5 for all samples from both groups in a single pass
-    # (more efficient than two separate reads)
+    # Read beta matrix and positions from HDF5
+    # Note: Query.process_genomic_queries has a bug where it uses chromosome-local
+    # row indices to slice the dataset instead of absolute row offsets, producing
+    # wrong data for any chromosome other than the first. read_region_from_h5
+    # correctly computes abs_left = row_start + left before slicing.
     all_samples = group1 + group2
     positions, beta_matrix, valid_samples = read_region_from_h5(
         h5file, all_samples, chrom, start, stop
     )
 
-    # Validate that we have enough probes to fit a GP (minimum 3)
     if positions is None or len(positions) < 3:
         return {'error': f'Too few probes in {chrom}:{start}-{stop} (need >= 3)'}
 
@@ -284,67 +319,51 @@ def run_gpdm(params):
             length_scale_bp=int(ann.get('length_scale_bp', 1000)),
         )
 
-    # --- Step 5: Run both GP models ---
-    # method='both' fits NaiveGP and DomainPartitionedGP independently.
-    # Results are stored in analysis.results_naive and analysis.results_annotation.
-    # The annotation-aware model is set as the primary result (analysis.results).
-    analysis.run(method='both')
-
-    # --- Step 6: Build grid response for the D3 visualization ---
-    # to_dataframe() exports 500-point predictions aligned to a uniform grid.
-    # Column names use the group label strings passed to load_methylation:
-    #   pred_group1, std_group1, pred_group2, std_group2
+    # --- Step 5: Run annotation-aware GP model only ---
+    # Skips NaiveGP to halve computation time. Results in analysis.results_annotation.
+    analysis.run(method='annotation_aware')
+
+    # --- Step 5b: Write visualization PNG to cache if a path was supplied ---
+    plot_path = params.get('plot_path')
+    if plot_path:
+        try:
+            import os
+            import matplotlib.pyplot as plt
+            os.makedirs(os.path.dirname(plot_path), exist_ok=True)
+            analysis.plot_results(results=analysis.results_annotation, save_path=plot_path, dark_theme=False)
+            plt.close('all')
+        except Exception:
+            pass  # non-fatal: analysis result still returned without image
+
+    # --- Step 6: Build grid response for the D3 visualization (termdb/gpdm) ---
+    # termdb/dmr ignores this; termdb/gpdm needs it for all 4 visualization panels.
     grid_df = analysis.to_dataframe()
 
     def safe_list(arr):
-        """
-        Convert a numpy array or pandas Series to a plain Python list,
-        replacing NaN and Inf values with None (JSON-serializable null).
-        The D3 visualization uses null to skip drawing at missing positions.
-        """
         return [None if (np.isnan(v) or np.isinf(v)) else float(v) for v in arr]
 
-    # Extract the four core GP prediction arrays from the DataFrame
-    pred_a = grid_df['pred_group1'].values  # group A posterior mean
-    std_a = grid_df['std_group1'].values    # group A posterior std
-    pred_b = grid_df['pred_group2'].values  # group B posterior mean
-    std_b = grid_df['std_group2'].values    # group B posterior std
+    pred_a = grid_df['pred_group1'].values
+    std_a  = grid_df['std_group1'].values
+    pred_b = grid_df['pred_group2'].values
+    std_b  = grid_df['std_group2'].values
 
-    # Build the grid dict sent to the client.
-    # CI bands are computed as mean ± 1.96*std (approximates 95% credible interval
-    # for visualization purposes — the exact CI is in ci_lower/ci_upper for DMR calling).
     grid = {
-        'positions': safe_list(grid_df['position']),          # genomic x-axis
-        'group_a_mean': safe_list(pred_a),                    # group A GP mean line
-        'group_a_lower': safe_list(pred_a - 1.96 * std_a),   # group A lower CI band
-        'group_a_upper': safe_list(pred_a + 1.96 * std_a),   # group A upper CI band
-        'group_b_mean': safe_list(pred_b),                    # group B GP mean line
-        'group_b_lower': safe_list(pred_b - 1.96 * std_b),   # group B lower CI band
-        'group_b_upper': safe_list(pred_b + 1.96 * std_b),   # group B upper CI band
-        'difference_mean': safe_list(grid_df['diff_mean']),   # Delta(x) posterior mean
-        'difference_lower': safe_list(grid_df['ci_lower']),   # Delta(x) 95% CI lower
-        'difference_upper': safe_list(grid_df['ci_upper']),   # Delta(x) 95% CI upper
-        'posterior_prob': safe_list(grid_df['prob_B_greater']),# P(group2 > group1) at each point
+        'positions':         safe_list(grid_df['position']),
+        'group_a_mean':      safe_list(pred_a),
+        'group_a_lower':     safe_list(pred_a - 1.96 * std_a),
+        'group_a_upper':     safe_list(pred_a + 1.96 * std_a),
+        'group_b_mean':      safe_list(pred_b),
+        'group_b_lower':     safe_list(pred_b - 1.96 * std_b),
+        'group_b_upper':     safe_list(pred_b + 1.96 * std_b),
+        'difference_mean':   safe_list(grid_df['diff_mean']),
+        'difference_lower':  safe_list(grid_df['ci_lower']),
+        'difference_upper':  safe_list(grid_df['ci_upper']),
+        'posterior_prob':    safe_list(grid_df['prob_B_greater']),
     }
 
-    # --- Step 7: Serialize naive DMRs ---
-    # These come from NaiveGP: a single global kernel, no annotation priors.
-    # Shown in the client as purple bars on the DMR track for comparison.
-    naive_dmrs = []
-    if analysis.results_naive and analysis.results_naive.dmrs:
-        for d in analysis.results_naive.dmrs:
-            naive_dmrs.append({
-                'chr': chrom,
-                'start': int(d.start),
-                'stop': int(d.end),
-                'width': int(d.width_bp),
-                'max_delta_beta': float(d.max_delta_beta),
-                'probability': float(d.mean_posterior_prob),
-            })
-
-    # --- Step 8: Serialize annotation-aware DMRs ---
-    # These come from DomainPartitionedGP: domain-specific priors and kernels.
-    # Shown as orange bars — the primary result shown to the user.
+    # --- Step 7: Serialize annotation-aware DMRs ---
+    # max_delta_beta is always positive (absolute peak effect size).
+    # mean_delta_beta is signed: positive = group B (group2) > group A (group1) = hyper.
     annot_dmrs = []
     if analysis.results_annotation and analysis.results_annotation.dmrs:
         for d in analysis.results_annotation.dmrs:
@@ -354,21 +373,22 @@ def safe_list(arr):
                 'stop': int(d.end),
                 'width': int(d.width_bp),
                 'max_delta_beta': float(d.max_delta_beta),
+                'direction': 'hyper' if d.mean_delta_beta >= 0 else 'hypo',
                 'probability': float(d.mean_posterior_prob),
             })
 
     return {
         'status': 'ok',
-        'dmrs': annot_dmrs,           # annotation-aware DMRs (primary result)
-        'naive_dmrs': naive_dmrs,     # naive DMRs (comparison reference)
-        'grid': grid,                 # 500-point posterior predictions for D3
+        'dmrs': annot_dmrs,
+        'naive_dmrs': [],   # naive model not run; kept for termdb/gpdm client compatibility
+        'grid': grid,
         'metadata': {
-            'n_probes': int(len(positions)),          # probes used after filtering
-            'n_probes_dropped': n_dropped,            # probes dropped by NaN threshold
-            'n_nan_imputed': nan_count,               # individual NaN values imputed
-            'n_samples_group1': n_g1,                 # group 1 sample count
-            'n_samples_group2': n_g2,                 # group 2 sample count
-            'region': f'{chrom}:{start}-{stop}',      # region string for display
+            'n_probes': int(len(positions)),
+            'n_probes_dropped': n_dropped,
+            'n_nan_imputed': nan_count,
+            'n_samples_group1': n_g1,
+            'n_samples_group2': n_g2,
+            'region': f'{chrom}:{start}-{stop}',
         }
     }
 
diff --git a/server/routes/termdb.dmr.ts b/server/routes/termdb.dmr.ts
@@ -1,7 +1,19 @@
 import type { RouteApi, TermdbDmrRequest, TermdbDmrSuccessResponse } from '#types'
 import { TermdbDmrPayload } from '#types/checkers'
-import { run_R } from '@sjcrh/proteinpaint-r'
+// import { run_R } from '@sjcrh/proteinpaint-r' // replaced by GPDM Python analysis
+import { run_python } from '@sjcrh/proteinpaint-python'
 import { invalidcoord } from '#shared/common.js'
+import serverconfig from '#src/serverconfig.js'
+import { mayLog } from '#src/helpers.ts'
+import { formatElapsedTime } from '#shared'
+import path from 'path'
+import fs from 'fs'
+import crypto from 'crypto'
+
+// Ensure the gpdm cache subdirectory exists (mirrors the grin2 pattern)
+
+const cachedir_gpdm = path.join(serverconfig.cachedir, 'gpdm')
+if (!fs.existsSync(cachedir_gpdm)) fs.mkdirSync(cachedir_gpdm, { recursive: true })
 
 export const api: RouteApi = {
 	endpoint: 'termdb/dmr',
@@ -29,23 +41,37 @@ function init({ genomes }) {
 
 			if (!Array.isArray(q.group1) || q.group1.length == 0) throw new Error('group1 not non empty array')
 			if (!Array.isArray(q.group2) || q.group2.length == 0) throw new Error('group2 not non empty array')
-
 			if (invalidcoord(genome, q.chr, q.start, q.stop)) throw new Error('invalid chr/start/stop')
 
-			const arg = {
-				group1: q.group1,
-				group2: q.group2,
-				file: ds.queries.dnaMethylation.file, // todo change file to mValueFile
+			const group1 = q.group1.map(s => s.sample).filter(Boolean)
+			const group2 = q.group2.map(s => s.sample).filter(Boolean)
+			if (group1.length < 3) throw new Error(`Need at least 3 samples in group1, got ${group1.length}`)
+			if (group2.length < 3) throw new Error(`Need at least 3 samples in group2, got ${group2.length}`)
+
+			const plotPath = path.join(cachedir_gpdm, `dmr_${crypto.randomBytes(16).toString('hex')}.png`)
+
+			const gpdmInput = {
+				h5file: ds.queries.dnaMethylation.file,
 				chr: q.chr,
 				start: q.start,
-				stop: q.stop
+				stop: q.stop,
+				group1,
+				group2,
+				annotations: q.annotations || [],
+				nan_threshold: q.nan_threshold ?? 0.5,
+				plot_path: plotPath
 			}
 
-			const result: any = JSON.parse(await run_R('dmr.R', JSON.stringify(arg)))
+			const time1 = Date.now()
+			const result = JSON.parse(await run_python('gpdm_analysis.py', JSON.stringify(gpdmInput)))
+			mayLog('DMR analysis time:', formatElapsedTime(Date.now() - time1))
 			if (result.error) throw new Error(result.error)
-			res.send(result as TermdbDmrSuccessResponse)
+
+			// PNG is written to cachedir_gpdm by Python and kept there for reference
+			res.send({ status: 'ok', dmrs: result.dmrs } as TermdbDmrSuccessResponse)
 		} catch (e: any) {
 			res.send({ error: e.message || e })
+			if (e instanceof Error && e.stack) console.log(e)
 		}
 	}
 }
diff --git a/shared/types/src/routes/termdb.dmr.ts b/shared/types/src/routes/termdb.dmr.ts