Skip to content

Commit 9076b22

Browse files
committed
WIP: Moving gpdm route to termdb.dmr.ts route. Making annotation aware dmrs paint on block instead of returnpng
1 parent ca15b27 commit 9076b22

File tree

5 files changed

+187
-100
lines changed

5 files changed

+187
-100
lines changed

client/plots/gb/view/View.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,7 @@ export class View {
307307
arg.chr = this.state.config.geneSearchResult.chr
308308
arg.start = this.state.config.geneSearchResult.start
309309
arg.stop = this.state.config.geneSearchResult.stop
310+
if (this.state.config.hlregions?.length) arg.hlregions = this.state.config.hlregions
310311
first_genetrack_tolist(this.opts.genome, arg.tklst)
311312

312313
const _ = await import('#src/block')

client/plots/volcano/interactions/VolcanoInteractions.ts

Lines changed: 50 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,7 @@ import { downloadTable, GeneSetEditUI, MultiTermWrapperEditUI, newSandboxDiv } f
33
import { to_svg } from '#src/client'
44
import type { VolcanoDom, VolcanoPlotConfig } from '../VolcanoTypes'
55
import { TermTypes } from '#shared/terms.js'
6-
import { GpdmPlot } from '../../gpdm/GpdmPlot'
76
import { dofetch3 } from '#common/dofetch'
8-
import { select } from 'd3-selection'
97

108
export class VolcanoInteractions {
119
app: MassAppApi
@@ -191,55 +189,82 @@ export class VolcanoInteractions {
191189
})
192190
}
193191

194-
/** When clicking on a DM data point, launches the GPDM probe-level
195-
* analysis in a new sandbox. Looks up gene coordinates via genelookup,
196-
* then calls termdb/gpdm for the region. */
192+
/** When clicking on a DM data point, runs GPDM analysis then opens a sandbox
193+
* (sibling to the volcano in the mass plotDiv) with a genome browser Block
194+
* at the gene locus and annotation-aware DMRs overlaid as highlight regions
195+
* (orange=hyper, blue=hypo). */
197196
async launchGpdm(geneName: string, promoterId?: string) {
198197
const config = this.app.getState().plots.find((p: VolcanoPlotConfig) => p.id === this.id)
199198
if (config.termType !== TermTypes.DNA_METHYLATION) return
200199

201200
const genome = this.app.vocabApi.vocab.genome
201+
const dslabel = this.app.vocabApi.vocab.dslabel
202+
const genomeObj = this.app.opts.genome
202203

203204
// Look up gene coordinates
204-
const result = await dofetch3('genelookup', {
205+
const geneResult = await dofetch3('genelookup', {
205206
body: { deep: 1, input: geneName, genome }
206207
})
207-
if (result.error || !result.gmlst || result.gmlst.length === 0) {
208+
if (geneResult.error || !geneResult.gmlst || geneResult.gmlst.length === 0) {
208209
window.alert(`Could not find coordinates for gene "${geneName}"`)
209210
return
210211
}
211212

212-
const gm = result.gmlst[0]
213-
// Expand region by 2kb on each side to capture flanking probes
213+
const gm = geneResult.gmlst[0]
214214
const pad = 2000
215215
const chr = gm.chr
216216
const start = Math.max(0, gm.start - pad)
217217
const stop = gm.stop + pad
218218

219-
// Build sample lists from the config's group data
220219
const group1 = config.samplelst.groups[0].values || []
221220
const group2 = config.samplelst.groups[1].values || []
222221

223-
// Open a new sandbox (PP standard pattern)
224-
const sandboxParent = this.app.opts.plotDiv || select(this.dom.holder.node()!.parentNode as HTMLElement)
225-
const sandbox = newSandboxDiv(sandboxParent)
226-
const title = promoterId ? `GPDM: ${geneName} (${promoterId})` : `GPDM: ${geneName}`
227-
sandbox.header.text(title)
222+
const sandbox = newSandboxDiv(this.dom.holder)
223+
sandbox.header.text(promoterId ? `DMR: ${geneName} (${promoterId})` : `DMR: ${geneName}`)
224+
const waitDiv = sandbox.body.append('div').style('padding', '10px').text('Running GPDM analysis…')
228225

229-
new GpdmPlot({
230-
holder: sandbox.body as any,
231-
genome,
232-
dslabel: this.app.vocabApi.vocab.dslabel,
226+
const dmrResult = await dofetch3('termdb/dmr', {
227+
body: { genome, dslabel, chr, start, stop, group1, group2 }
228+
})
229+
waitDiv.remove()
230+
231+
if (dmrResult.error) {
232+
sandbox.body.append('div').style('padding', '10px').style('color', 'red').text(dmrResult.error)
233+
return
234+
}
235+
236+
// Build hlregions — orange=hyper, blue=hypo, alpha scaled by probability
237+
const hlregions = (dmrResult.dmrs ?? []).map((dmr: any) => {
238+
const alpha = Math.round(Math.min(255, (0.5 + dmr.probability * 0.5) * 255))
239+
const hex = alpha.toString(16).padStart(2, '0')
240+
const base = dmr.direction === 'hyper' ? '#e66101' : '#5e81f4'
241+
return { chr: dmr.chr, start: dmr.start, stop: dmr.stop, color: base + hex }
242+
})
243+
244+
const { first_genetrack_tolist } = await import('#common/1stGenetk')
245+
const tklst: any[] = []
246+
first_genetrack_tolist(genomeObj, tklst)
247+
const { Block } = await import('#src/block')
248+
new Block({
249+
holder: sandbox.body,
250+
genome: genomeObj,
233251
chr,
234252
start,
235253
stop,
236-
geneName,
237-
promoterId,
238-
group1,
239-
group2,
240-
group1Name: config.samplelst.groups[0].name,
241-
group2Name: config.samplelst.groups[1].name
254+
tklst,
255+
hlregions,
256+
nobox: true,
257+
width: 800,
258+
hidegenelegend: true
242259
})
260+
261+
if (!dmrResult.dmrs?.length) {
262+
sandbox.body
263+
.append('div')
264+
.style('padding', '6px 0')
265+
.style('color', '#888')
266+
.text('No significant DMRs detected in this region')
267+
}
243268
}
244269

245270
async launchDEGClustering() {

python/src/gpdm_analysis.py

Lines changed: 84 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -53,9 +53,42 @@
5353
# import triggers basicConfig.
5454
logging.getLogger("gpdm").setLevel(logging.CRITICAL)
5555

56+
# Set matplotlib to non-interactive Agg backend before gpdm import,
57+
# since gpdm/core.py imports matplotlib.pyplot at module load time.
58+
import matplotlib
59+
matplotlib.use('Agg')
60+
5661
# Import the GPDM analysis class (now safe to import without stderr side-effects)
5762
from gpdm import RegionalDMAnalysis
5863

64+
# Import Query from the existing PP HDF5 reader (same python/src/ directory)
65+
from query_beta_values import Query
66+
67+
68+
def get_region_positions(h5file, chrom, start, stop):
69+
"""
70+
Read CpG genomic positions for a region from the HDF5 file.
71+
Query.process_genomic_queries() returns only the beta matrix, not positions,
72+
so this small helper reads just the meta/start array for the region.
73+
Uses the same boundary logic as Query to ensure row alignment.
74+
"""
75+
with h5py.File(h5file, 'r') as f:
76+
chrom_lengths = json.loads(f['/'].attrs['chrom_lengths'])
77+
if chrom not in chrom_lengths:
78+
return None
79+
chroms = list(chrom_lengths.keys())
80+
prefix = [0]
81+
for c in chroms:
82+
prefix.append(prefix[-1] + chrom_lengths[c])
83+
idx = chroms.index(chrom)
84+
row_start = prefix[idx]
85+
start_pos = f['meta/start'][row_start:row_start + chrom_lengths[chrom]]
86+
left = int(np.searchsorted(start_pos, start, 'left'))
87+
right = int(np.searchsorted(start_pos, stop, 'right'))
88+
if left >= right:
89+
return None
90+
return start_pos[left:right]
91+
5992

6093
def read_region_from_h5(h5file, samples, chrom, start, stop):
6194
"""
@@ -197,14 +230,16 @@ def run_gpdm(params):
197230
nan_threshold = float(params.get('nan_threshold', 0.5)) # drop probes with > 50% missing
198231
annotations = params.get('annotations', []) # regulatory domain annotations
199232

200-
# Read the HDF5 for all samples from both groups in a single pass
201-
# (more efficient than two separate reads)
233+
# Read beta matrix and positions from HDF5
234+
# Note: Query.process_genomic_queries has a bug where it uses chromosome-local
235+
# row indices to slice the dataset instead of absolute row offsets, producing
236+
# wrong data for any chromosome other than the first. read_region_from_h5
237+
# correctly computes abs_left = row_start + left before slicing.
202238
all_samples = group1 + group2
203239
positions, beta_matrix, valid_samples = read_region_from_h5(
204240
h5file, all_samples, chrom, start, stop
205241
)
206242

207-
# Validate that we have enough probes to fit a GP (minimum 3)
208243
if positions is None or len(positions) < 3:
209244
return {'error': f'Too few probes in {chrom}:{start}-{stop} (need >= 3)'}
210245

@@ -284,67 +319,51 @@ def run_gpdm(params):
284319
length_scale_bp=int(ann.get('length_scale_bp', 1000)),
285320
)
286321

287-
# --- Step 5: Run both GP models ---
288-
# method='both' fits NaiveGP and DomainPartitionedGP independently.
289-
# Results are stored in analysis.results_naive and analysis.results_annotation.
290-
# The annotation-aware model is set as the primary result (analysis.results).
291-
analysis.run(method='both')
292-
293-
# --- Step 6: Build grid response for the D3 visualization ---
294-
# to_dataframe() exports 500-point predictions aligned to a uniform grid.
295-
# Column names use the group label strings passed to load_methylation:
296-
# pred_group1, std_group1, pred_group2, std_group2
322+
# --- Step 5: Run annotation-aware GP model only ---
323+
# Skips NaiveGP to halve computation time. Results in analysis.results_annotation.
324+
analysis.run(method='annotation_aware')
325+
326+
# --- Step 5b: Write visualization PNG to cache if a path was supplied ---
327+
plot_path = params.get('plot_path')
328+
if plot_path:
329+
try:
330+
import os
331+
import matplotlib.pyplot as plt
332+
os.makedirs(os.path.dirname(plot_path), exist_ok=True)
333+
analysis.plot_results(results=analysis.results_annotation, save_path=plot_path, dark_theme=False)
334+
plt.close('all')
335+
except Exception:
336+
pass # non-fatal: analysis result still returned without image
337+
338+
# --- Step 6: Build grid response for the D3 visualization (termdb/gpdm) ---
339+
# termdb/dmr ignores this; termdb/gpdm needs it for all 4 visualization panels.
297340
grid_df = analysis.to_dataframe()
298341

299342
def safe_list(arr):
300-
"""
301-
Convert a numpy array or pandas Series to a plain Python list,
302-
replacing NaN and Inf values with None (JSON-serializable null).
303-
The D3 visualization uses null to skip drawing at missing positions.
304-
"""
305343
return [None if (np.isnan(v) or np.isinf(v)) else float(v) for v in arr]
306344

307-
# Extract the four core GP prediction arrays from the DataFrame
308-
pred_a = grid_df['pred_group1'].values # group A posterior mean
309-
std_a = grid_df['std_group1'].values # group A posterior std
310-
pred_b = grid_df['pred_group2'].values # group B posterior mean
311-
std_b = grid_df['std_group2'].values # group B posterior std
345+
pred_a = grid_df['pred_group1'].values
346+
std_a = grid_df['std_group1'].values
347+
pred_b = grid_df['pred_group2'].values
348+
std_b = grid_df['std_group2'].values
312349

313-
# Build the grid dict sent to the client.
314-
# CI bands are computed as mean ± 1.96*std (approximates 95% credible interval
315-
# for visualization purposes — the exact CI is in ci_lower/ci_upper for DMR calling).
316350
grid = {
317-
'positions': safe_list(grid_df['position']), # genomic x-axis
318-
'group_a_mean': safe_list(pred_a), # group A GP mean line
319-
'group_a_lower': safe_list(pred_a - 1.96 * std_a), # group A lower CI band
320-
'group_a_upper': safe_list(pred_a + 1.96 * std_a), # group A upper CI band
321-
'group_b_mean': safe_list(pred_b), # group B GP mean line
322-
'group_b_lower': safe_list(pred_b - 1.96 * std_b), # group B lower CI band
323-
'group_b_upper': safe_list(pred_b + 1.96 * std_b), # group B upper CI band
324-
'difference_mean': safe_list(grid_df['diff_mean']), # Delta(x) posterior mean
325-
'difference_lower': safe_list(grid_df['ci_lower']), # Delta(x) 95% CI lower
326-
'difference_upper': safe_list(grid_df['ci_upper']), # Delta(x) 95% CI upper
327-
'posterior_prob': safe_list(grid_df['prob_B_greater']),# P(group2 > group1) at each point
351+
'positions': safe_list(grid_df['position']),
352+
'group_a_mean': safe_list(pred_a),
353+
'group_a_lower': safe_list(pred_a - 1.96 * std_a),
354+
'group_a_upper': safe_list(pred_a + 1.96 * std_a),
355+
'group_b_mean': safe_list(pred_b),
356+
'group_b_lower': safe_list(pred_b - 1.96 * std_b),
357+
'group_b_upper': safe_list(pred_b + 1.96 * std_b),
358+
'difference_mean': safe_list(grid_df['diff_mean']),
359+
'difference_lower': safe_list(grid_df['ci_lower']),
360+
'difference_upper': safe_list(grid_df['ci_upper']),
361+
'posterior_prob': safe_list(grid_df['prob_B_greater']),
328362
}
329363

330-
# --- Step 7: Serialize naive DMRs ---
331-
# These come from NaiveGP: a single global kernel, no annotation priors.
332-
# Shown in the client as purple bars on the DMR track for comparison.
333-
naive_dmrs = []
334-
if analysis.results_naive and analysis.results_naive.dmrs:
335-
for d in analysis.results_naive.dmrs:
336-
naive_dmrs.append({
337-
'chr': chrom,
338-
'start': int(d.start),
339-
'stop': int(d.end),
340-
'width': int(d.width_bp),
341-
'max_delta_beta': float(d.max_delta_beta),
342-
'probability': float(d.mean_posterior_prob),
343-
})
344-
345-
# --- Step 8: Serialize annotation-aware DMRs ---
346-
# These come from DomainPartitionedGP: domain-specific priors and kernels.
347-
# Shown as orange bars — the primary result shown to the user.
364+
# --- Step 7: Serialize annotation-aware DMRs ---
365+
# max_delta_beta is always positive (absolute peak effect size).
366+
# mean_delta_beta is signed: positive = group B (group2) > group A (group1) = hyper.
348367
annot_dmrs = []
349368
if analysis.results_annotation and analysis.results_annotation.dmrs:
350369
for d in analysis.results_annotation.dmrs:
@@ -354,21 +373,22 @@ def safe_list(arr):
354373
'stop': int(d.end),
355374
'width': int(d.width_bp),
356375
'max_delta_beta': float(d.max_delta_beta),
376+
'direction': 'hyper' if d.mean_delta_beta >= 0 else 'hypo',
357377
'probability': float(d.mean_posterior_prob),
358378
})
359379

360380
return {
361381
'status': 'ok',
362-
'dmrs': annot_dmrs, # annotation-aware DMRs (primary result)
363-
'naive_dmrs': naive_dmrs, # naive DMRs (comparison reference)
364-
'grid': grid, # 500-point posterior predictions for D3
382+
'dmrs': annot_dmrs,
383+
'naive_dmrs': [], # naive model not run; kept for termdb/gpdm client compatibility
384+
'grid': grid,
365385
'metadata': {
366-
'n_probes': int(len(positions)), # probes used after filtering
367-
'n_probes_dropped': n_dropped, # probes dropped by NaN threshold
368-
'n_nan_imputed': nan_count, # individual NaN values imputed
369-
'n_samples_group1': n_g1, # group 1 sample count
370-
'n_samples_group2': n_g2, # group 2 sample count
371-
'region': f'{chrom}:{start}-{stop}', # region string for display
386+
'n_probes': int(len(positions)),
387+
'n_probes_dropped': n_dropped,
388+
'n_nan_imputed': nan_count,
389+
'n_samples_group1': n_g1,
390+
'n_samples_group2': n_g2,
391+
'region': f'{chrom}:{start}-{stop}',
372392
}
373393
}
374394

server/routes/termdb.dmr.ts

Lines changed: 35 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,19 @@
11
import type { RouteApi, TermdbDmrRequest, TermdbDmrSuccessResponse } from '#types'
22
import { TermdbDmrPayload } from '#types/checkers'
3-
import { run_R } from '@sjcrh/proteinpaint-r'
3+
// import { run_R } from '@sjcrh/proteinpaint-r' // replaced by GPDM Python analysis
4+
import { run_python } from '@sjcrh/proteinpaint-python'
45
import { invalidcoord } from '#shared/common.js'
6+
import serverconfig from '#src/serverconfig.js'
7+
import { mayLog } from '#src/helpers.ts'
8+
import { formatElapsedTime } from '#shared'
9+
import path from 'path'
10+
import fs from 'fs'
11+
import crypto from 'crypto'
12+
13+
// Ensure the gpdm cache subdirectory exists (mirrors the grin2 pattern)
14+
15+
const cachedir_gpdm = path.join(serverconfig.cachedir, 'gpdm')
16+
if (!fs.existsSync(cachedir_gpdm)) fs.mkdirSync(cachedir_gpdm, { recursive: true })
517

618
export const api: RouteApi = {
719
endpoint: 'termdb/dmr',
@@ -29,23 +41,37 @@ function init({ genomes }) {
2941

3042
if (!Array.isArray(q.group1) || q.group1.length == 0) throw new Error('group1 not non empty array')
3143
if (!Array.isArray(q.group2) || q.group2.length == 0) throw new Error('group2 not non empty array')
32-
3344
if (invalidcoord(genome, q.chr, q.start, q.stop)) throw new Error('invalid chr/start/stop')
3445

35-
const arg = {
36-
group1: q.group1,
37-
group2: q.group2,
38-
file: ds.queries.dnaMethylation.file, // todo change file to mValueFile
46+
const group1 = q.group1.map(s => s.sample).filter(Boolean)
47+
const group2 = q.group2.map(s => s.sample).filter(Boolean)
48+
if (group1.length < 3) throw new Error(`Need at least 3 samples in group1, got ${group1.length}`)
49+
if (group2.length < 3) throw new Error(`Need at least 3 samples in group2, got ${group2.length}`)
50+
51+
const plotPath = path.join(cachedir_gpdm, `dmr_${crypto.randomBytes(16).toString('hex')}.png`)
52+
53+
const gpdmInput = {
54+
h5file: ds.queries.dnaMethylation.file,
3955
chr: q.chr,
4056
start: q.start,
41-
stop: q.stop
57+
stop: q.stop,
58+
group1,
59+
group2,
60+
annotations: q.annotations || [],
61+
nan_threshold: q.nan_threshold ?? 0.5,
62+
plot_path: plotPath
4263
}
4364

44-
const result: any = JSON.parse(await run_R('dmr.R', JSON.stringify(arg)))
65+
const time1 = Date.now()
66+
const result = JSON.parse(await run_python('gpdm_analysis.py', JSON.stringify(gpdmInput)))
67+
mayLog('DMR analysis time:', formatElapsedTime(Date.now() - time1))
4568
if (result.error) throw new Error(result.error)
46-
res.send(result as TermdbDmrSuccessResponse)
69+
70+
// PNG is written to cachedir_gpdm by Python and kept there for reference
71+
res.send({ status: 'ok', dmrs: result.dmrs } as TermdbDmrSuccessResponse)
4772
} catch (e: any) {
4873
res.send({ error: e.message || e })
74+
if (e instanceof Error && e.stack) console.log(e)
4975
}
5076
}
5177
}

0 commit comments

Comments
 (0)