Skip to content

Commit 19eb70c

Browse files
committed
WIP: Moving gpdm route to termdb.dmr.ts route. Making annotation aware dmrs paint on block instead of returnpng
1 parent a307938 commit 19eb70c

File tree

5 files changed

+160
-80
lines changed

5 files changed

+160
-80
lines changed

client/plots/gb/view/View.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,7 @@ export class View {
307307
arg.chr = this.state.config.geneSearchResult.chr
308308
arg.start = this.state.config.geneSearchResult.start
309309
arg.stop = this.state.config.geneSearchResult.stop
310+
if (this.state.config.hlregions?.length) arg.hlregions = this.state.config.hlregions
310311
first_genetrack_tolist(this.opts.genome, arg.tklst)
311312

312313
const _ = await import('#src/block')

client/plots/volcano/interactions/VolcanoInteractions.ts

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -187,9 +187,10 @@ export class VolcanoInteractions {
187187
})
188188
}
189189

190-
/** When clicking on a DM data point, launches the GPDM probe-level
191-
* analysis in a new sandbox. Looks up gene coordinates via genelookup,
192-
* then calls termdb/gpdm for the region. */
190+
/** When clicking on a DM data point, runs GPDM analysis then opens a sandbox
191+
* (sibling to the volcano in the mass plotDiv) with a genome browser Block
192+
* at the gene locus and annotation-aware DMRs overlaid as highlight regions
193+
* (orange=hyper, blue=hypo). */
193194
async launchGpdm(geneName: string, promoterId?: string) {
194195
const config = this.app.getState().plots.find((p: VolcanoPlotConfig) => p.id === this.id)
195196
if (config.termType !== DNA_METHYLATION) return
@@ -198,6 +199,7 @@ export class VolcanoInteractions {
198199
const dslabel = this.app.vocabApi.vocab.dslabel
199200
const genomeObj = this.app.opts.genome
200201

202+
// Look up gene coordinates
201203
const geneResult = await dofetch3('genelookup', {
202204
body: { deep: 1, input: geneName, genome }
203205
})
@@ -229,6 +231,7 @@ export class VolcanoInteractions {
229231
return
230232
}
231233

234+
// Build hlregions — orange=hyper, blue=hypo, alpha scaled by probability
232235
const hlregions = (dmrResult.dmrs ?? []).map((dmr: any) => {
233236
const alpha = Math.round(Math.min(255, (0.5 + dmr.probability * 0.5) * 255))
234237
const hex = alpha.toString(16).padStart(2, '0')
@@ -240,10 +243,25 @@ export class VolcanoInteractions {
240243
const tklst: any[] = []
241244
first_genetrack_tolist(genomeObj, tklst)
242245
const { Block } = await import('#src/block')
243-
new Block({ holder: sandbox.body, genome: genomeObj, chr, start, stop, tklst, hlregions, nobox: true, width: 800, hidegenelegend: true })
246+
new Block({
247+
holder: sandbox.body,
248+
genome: genomeObj,
249+
chr,
250+
start,
251+
stop,
252+
tklst,
253+
hlregions,
254+
nobox: true,
255+
width: 800,
256+
hidegenelegend: true
257+
})
244258

245259
if (!dmrResult.dmrs?.length) {
246-
sandbox.body.append('div').style('padding', '6px 0').style('color', '#888').text('No significant DMRs detected in this region')
260+
sandbox.body
261+
.append('div')
262+
.style('padding', '6px 0')
263+
.style('color', '#888')
264+
.text('No significant DMRs detected in this region')
247265
}
248266
}
249267

python/src/gpdm_analysis.py

Lines changed: 84 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -53,9 +53,42 @@
5353
# import triggers basicConfig.
5454
logging.getLogger("gpdm").setLevel(logging.CRITICAL)
5555

56+
# Set matplotlib to non-interactive Agg backend before gpdm import,
57+
# since gpdm/core.py imports matplotlib.pyplot at module load time.
58+
import matplotlib
59+
matplotlib.use('Agg')
60+
5661
# Import the GPDM analysis class (now safe to import without stderr side-effects)
5762
from gpdm import RegionalDMAnalysis
5863

64+
# Import Query from the existing PP HDF5 reader (same python/src/ directory)
65+
from query_beta_values import Query
66+
67+
68+
def get_region_positions(h5file, chrom, start, stop):
69+
"""
70+
Read CpG genomic positions for a region from the HDF5 file.
71+
Query.process_genomic_queries() returns only the beta matrix, not positions,
72+
so this small helper reads just the meta/start array for the region.
73+
Uses the same boundary logic as Query to ensure row alignment.
74+
"""
75+
with h5py.File(h5file, 'r') as f:
76+
chrom_lengths = json.loads(f['/'].attrs['chrom_lengths'])
77+
if chrom not in chrom_lengths:
78+
return None
79+
chroms = list(chrom_lengths.keys())
80+
prefix = [0]
81+
for c in chroms:
82+
prefix.append(prefix[-1] + chrom_lengths[c])
83+
idx = chroms.index(chrom)
84+
row_start = prefix[idx]
85+
start_pos = f['meta/start'][row_start:row_start + chrom_lengths[chrom]]
86+
left = int(np.searchsorted(start_pos, start, 'left'))
87+
right = int(np.searchsorted(start_pos, stop, 'right'))
88+
if left >= right:
89+
return None
90+
return start_pos[left:right]
91+
5992

6093
def read_region_from_h5(h5file, samples, chrom, start, stop):
6194
"""
@@ -197,14 +230,16 @@ def run_gpdm(params):
197230
nan_threshold = float(params.get('nan_threshold', 0.5)) # drop probes with > 50% missing
198231
annotations = params.get('annotations', []) # regulatory domain annotations
199232

200-
# Read the HDF5 for all samples from both groups in a single pass
201-
# (more efficient than two separate reads)
233+
# Read beta matrix and positions from HDF5
234+
# Note: Query.process_genomic_queries has a bug where it uses chromosome-local
235+
# row indices to slice the dataset instead of absolute row offsets, producing
236+
# wrong data for any chromosome other than the first. read_region_from_h5
237+
# correctly computes abs_left = row_start + left before slicing.
202238
all_samples = group1 + group2
203239
positions, beta_matrix, valid_samples = read_region_from_h5(
204240
h5file, all_samples, chrom, start, stop
205241
)
206242

207-
# Validate that we have enough probes to fit a GP (minimum 3)
208243
if positions is None or len(positions) < 3:
209244
return {'error': f'Too few probes in {chrom}:{start}-{stop} (need >= 3)'}
210245

@@ -284,67 +319,51 @@ def run_gpdm(params):
284319
length_scale_bp=int(ann.get('length_scale_bp', 1000)),
285320
)
286321

287-
# --- Step 5: Run both GP models ---
288-
# method='both' fits NaiveGP and DomainPartitionedGP independently.
289-
# Results are stored in analysis.results_naive and analysis.results_annotation.
290-
# The annotation-aware model is set as the primary result (analysis.results).
291-
analysis.run(method='both')
292-
293-
# --- Step 6: Build grid response for the D3 visualization ---
294-
# to_dataframe() exports 500-point predictions aligned to a uniform grid.
295-
# Column names use the group label strings passed to load_methylation:
296-
# pred_group1, std_group1, pred_group2, std_group2
322+
# --- Step 5: Run annotation-aware GP model only ---
323+
# Skips NaiveGP to halve computation time. Results in analysis.results_annotation.
324+
analysis.run(method='annotation_aware')
325+
326+
# --- Step 5b: Write visualization PNG to cache if a path was supplied ---
327+
plot_path = params.get('plot_path')
328+
if plot_path:
329+
try:
330+
import os
331+
import matplotlib.pyplot as plt
332+
os.makedirs(os.path.dirname(plot_path), exist_ok=True)
333+
analysis.plot_results(results=analysis.results_annotation, save_path=plot_path, dark_theme=False)
334+
plt.close('all')
335+
except Exception:
336+
pass # non-fatal: analysis result still returned without image
337+
338+
# --- Step 6: Build grid response for the D3 visualization (termdb/gpdm) ---
339+
# termdb/dmr ignores this; termdb/gpdm needs it for all 4 visualization panels.
297340
grid_df = analysis.to_dataframe()
298341

299342
def safe_list(arr):
300-
"""
301-
Convert a numpy array or pandas Series to a plain Python list,
302-
replacing NaN and Inf values with None (JSON-serializable null).
303-
The D3 visualization uses null to skip drawing at missing positions.
304-
"""
305343
return [None if (np.isnan(v) or np.isinf(v)) else float(v) for v in arr]
306344

307-
# Extract the four core GP prediction arrays from the DataFrame
308-
pred_a = grid_df['pred_group1'].values # group A posterior mean
309-
std_a = grid_df['std_group1'].values # group A posterior std
310-
pred_b = grid_df['pred_group2'].values # group B posterior mean
311-
std_b = grid_df['std_group2'].values # group B posterior std
345+
pred_a = grid_df['pred_group1'].values
346+
std_a = grid_df['std_group1'].values
347+
pred_b = grid_df['pred_group2'].values
348+
std_b = grid_df['std_group2'].values
312349

313-
# Build the grid dict sent to the client.
314-
# CI bands are computed as mean ± 1.96*std (approximates 95% credible interval
315-
# for visualization purposes — the exact CI is in ci_lower/ci_upper for DMR calling).
316350
grid = {
317-
'positions': safe_list(grid_df['position']), # genomic x-axis
318-
'group_a_mean': safe_list(pred_a), # group A GP mean line
319-
'group_a_lower': safe_list(pred_a - 1.96 * std_a), # group A lower CI band
320-
'group_a_upper': safe_list(pred_a + 1.96 * std_a), # group A upper CI band
321-
'group_b_mean': safe_list(pred_b), # group B GP mean line
322-
'group_b_lower': safe_list(pred_b - 1.96 * std_b), # group B lower CI band
323-
'group_b_upper': safe_list(pred_b + 1.96 * std_b), # group B upper CI band
324-
'difference_mean': safe_list(grid_df['diff_mean']), # Delta(x) posterior mean
325-
'difference_lower': safe_list(grid_df['ci_lower']), # Delta(x) 95% CI lower
326-
'difference_upper': safe_list(grid_df['ci_upper']), # Delta(x) 95% CI upper
327-
'posterior_prob': safe_list(grid_df['prob_B_greater']),# P(group2 > group1) at each point
351+
'positions': safe_list(grid_df['position']),
352+
'group_a_mean': safe_list(pred_a),
353+
'group_a_lower': safe_list(pred_a - 1.96 * std_a),
354+
'group_a_upper': safe_list(pred_a + 1.96 * std_a),
355+
'group_b_mean': safe_list(pred_b),
356+
'group_b_lower': safe_list(pred_b - 1.96 * std_b),
357+
'group_b_upper': safe_list(pred_b + 1.96 * std_b),
358+
'difference_mean': safe_list(grid_df['diff_mean']),
359+
'difference_lower': safe_list(grid_df['ci_lower']),
360+
'difference_upper': safe_list(grid_df['ci_upper']),
361+
'posterior_prob': safe_list(grid_df['prob_B_greater']),
328362
}
329363

330-
# --- Step 7: Serialize naive DMRs ---
331-
# These come from NaiveGP: a single global kernel, no annotation priors.
332-
# Shown in the client as purple bars on the DMR track for comparison.
333-
naive_dmrs = []
334-
if analysis.results_naive and analysis.results_naive.dmrs:
335-
for d in analysis.results_naive.dmrs:
336-
naive_dmrs.append({
337-
'chr': chrom,
338-
'start': int(d.start),
339-
'stop': int(d.end),
340-
'width': int(d.width_bp),
341-
'max_delta_beta': float(d.max_delta_beta),
342-
'probability': float(d.mean_posterior_prob),
343-
})
344-
345-
# --- Step 8: Serialize annotation-aware DMRs ---
346-
# These come from DomainPartitionedGP: domain-specific priors and kernels.
347-
# Shown as orange bars — the primary result shown to the user.
364+
# --- Step 7: Serialize annotation-aware DMRs ---
365+
# max_delta_beta is always positive (absolute peak effect size).
366+
# mean_delta_beta is signed: positive = group B (group2) > group A (group1) = hyper.
348367
annot_dmrs = []
349368
if analysis.results_annotation and analysis.results_annotation.dmrs:
350369
for d in analysis.results_annotation.dmrs:
@@ -354,21 +373,22 @@ def safe_list(arr):
354373
'stop': int(d.end),
355374
'width': int(d.width_bp),
356375
'max_delta_beta': float(d.max_delta_beta),
376+
'direction': 'hyper' if d.mean_delta_beta >= 0 else 'hypo',
357377
'probability': float(d.mean_posterior_prob),
358378
})
359379

360380
return {
361381
'status': 'ok',
362-
'dmrs': annot_dmrs, # annotation-aware DMRs (primary result)
363-
'naive_dmrs': naive_dmrs, # naive DMRs (comparison reference)
364-
'grid': grid, # 500-point posterior predictions for D3
382+
'dmrs': annot_dmrs,
383+
'naive_dmrs': [], # naive model not run; kept for termdb/gpdm client compatibility
384+
'grid': grid,
365385
'metadata': {
366-
'n_probes': int(len(positions)), # probes used after filtering
367-
'n_probes_dropped': n_dropped, # probes dropped by NaN threshold
368-
'n_nan_imputed': nan_count, # individual NaN values imputed
369-
'n_samples_group1': n_g1, # group 1 sample count
370-
'n_samples_group2': n_g2, # group 2 sample count
371-
'region': f'{chrom}:{start}-{stop}', # region string for display
386+
'n_probes': int(len(positions)),
387+
'n_probes_dropped': n_dropped,
388+
'n_nan_imputed': nan_count,
389+
'n_samples_group1': n_g1,
390+
'n_samples_group2': n_g2,
391+
'region': f'{chrom}:{start}-{stop}',
372392
}
373393
}
374394

server/routes/termdb.dmr.ts

Lines changed: 35 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,19 @@
11
import type { RouteApi, TermdbDmrRequest, TermdbDmrSuccessResponse } from '#types'
22
import { TermdbDmrPayload } from '#types/checkers'
3-
import { run_R } from '@sjcrh/proteinpaint-r'
3+
// import { run_R } from '@sjcrh/proteinpaint-r' // replaced by GPDM Python analysis
4+
import { run_python } from '@sjcrh/proteinpaint-python'
45
import { invalidcoord } from '#shared/common.js'
6+
import serverconfig from '#src/serverconfig.js'
7+
import { mayLog } from '#src/helpers.ts'
8+
import { formatElapsedTime } from '#shared'
9+
import path from 'path'
10+
import fs from 'fs'
11+
import crypto from 'crypto'
12+
13+
// Ensure the gpdm cache subdirectory exists (mirrors the grin2 pattern)
14+
15+
const cachedir_gpdm = path.join(serverconfig.cachedir, 'gpdm')
16+
if (!fs.existsSync(cachedir_gpdm)) fs.mkdirSync(cachedir_gpdm, { recursive: true })
517

618
export const api: RouteApi = {
719
endpoint: 'termdb/dmr',
@@ -29,23 +41,37 @@ function init({ genomes }) {
2941

3042
if (!Array.isArray(q.group1) || q.group1.length == 0) throw new Error('group1 not non empty array')
3143
if (!Array.isArray(q.group2) || q.group2.length == 0) throw new Error('group2 not non empty array')
32-
3344
if (invalidcoord(genome, q.chr, q.start, q.stop)) throw new Error('invalid chr/start/stop')
3445

35-
const arg = {
36-
group1: q.group1,
37-
group2: q.group2,
38-
file: ds.queries.dnaMethylation.file, // todo change file to mValueFile
46+
const group1 = q.group1.map(s => s.sample).filter(Boolean)
47+
const group2 = q.group2.map(s => s.sample).filter(Boolean)
48+
if (group1.length < 3) throw new Error(`Need at least 3 samples in group1, got ${group1.length}`)
49+
if (group2.length < 3) throw new Error(`Need at least 3 samples in group2, got ${group2.length}`)
50+
51+
const plotPath = path.join(cachedir_gpdm, `dmr_${crypto.randomBytes(16).toString('hex')}.png`)
52+
53+
const gpdmInput = {
54+
h5file: ds.queries.dnaMethylation.file,
3955
chr: q.chr,
4056
start: q.start,
41-
stop: q.stop
57+
stop: q.stop,
58+
group1,
59+
group2,
60+
annotations: q.annotations || [],
61+
nan_threshold: q.nan_threshold ?? 0.5,
62+
plot_path: plotPath
4263
}
4364

44-
const result: any = JSON.parse(await run_R('dmr.R', JSON.stringify(arg)))
65+
const time1 = Date.now()
66+
const result = JSON.parse(await run_python('gpdm_analysis.py', JSON.stringify(gpdmInput)))
67+
mayLog('DMR analysis time:', formatElapsedTime(Date.now() - time1))
4568
if (result.error) throw new Error(result.error)
46-
res.send(result as TermdbDmrSuccessResponse)
69+
70+
// PNG is written to cachedir_gpdm by Python and kept there for reference
71+
res.send({ status: 'ok', dmrs: result.dmrs } as TermdbDmrSuccessResponse)
4772
} catch (e: any) {
4873
res.send({ error: e.message || e })
74+
if (e instanceof Error && e.stack) console.log(e)
4975
}
5076
}
5177
}

shared/types/src/routes/termdb.dmr.ts

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,10 @@ export type TermdbDmrRequest = {
1111
chr: string
1212
start: number
1313
stop: number
14-
// todo more params
14+
/** optional regulatory domain annotations for the GP model */
15+
annotations?: DmrAnnotation[]
16+
/** max fraction of NaN per probe before dropping (default 0.5) */
17+
nan_threshold?: number
1518
filter?: Filter
1619
__protected__?: any
1720
}
@@ -21,13 +24,25 @@ type Sample = {
2124
sample: string
2225
}
2326

27+
type DmrAnnotation = {
28+
name: string
29+
start: number
30+
end: number
31+
base_methylation?: number
32+
length_scale_bp?: number
33+
}
34+
2435
export type TermdbDmrSuccessResponse = {
2536
status: 'ok'
2637
dmrs: {
2738
chr: string
2839
start: number
2940
stop: number
30-
// todo more stats
41+
width: number
42+
max_delta_beta: number
43+
/** hyper = group2 hypermethylated relative to group1; hypo = opposite */
44+
direction: 'hyper' | 'hypo'
45+
probability: number
3146
}[]
3247
}
3348

0 commit comments

Comments
 (0)