5353# import triggers basicConfig.
5454logging .getLogger ("gpdm" ).setLevel (logging .CRITICAL )
5555
56+ # Set matplotlib to non-interactive Agg backend before gpdm import,
57+ # since gpdm/core.py imports matplotlib.pyplot at module load time.
58+ import matplotlib
59+ matplotlib .use ('Agg' )
60+
5661# Import the GPDM analysis class (now safe to import without stderr side-effects)
5762from gpdm import RegionalDMAnalysis
5863
64+ # Import Query from the existing PP HDF5 reader (same python/src/ directory)
65+ from query_beta_values import Query
66+
67+
def get_region_positions(h5file, chrom, start, stop):
    """
    Return the CpG start coordinates stored in the HDF5 file for one region.

    Query.process_genomic_queries() hands back only the beta matrix, so this
    helper reads just the slice of the meta/start array that belongs to
    `chrom` and trims it to [start, stop]. The boundary rule (left bisect on
    `start`, right bisect on `stop`) is intended to mirror Query so that the
    returned positions line up row-for-row with its beta matrix.

    Parameters:
        h5file: path of the HDF5 file to open read-only.
        chrom:  chromosome name; must be a key of the root 'chrom_lengths'
                attribute (a JSON object mapping name -> row count).
        start:  lower bound of the region (inclusive).
        stop:   upper bound of the region (inclusive).

    Returns:
        The slice of meta/start values falling inside the region, or None
        when the chromosome is unknown or no position lies in the region.
    """
    with h5py.File(h5file, 'r') as h5:
        lengths_by_chrom = json.loads(h5['/'].attrs['chrom_lengths'])
        if chrom not in lengths_by_chrom:
            return None

        # The row offset of a chromosome is the total row count of every
        # chromosome listed before it in the chrom_lengths mapping.
        # NOTE(review): assumes meta/start is sorted within a chromosome,
        # which searchsorted requires - confirm against the file writer.
        row_offset = 0
        for name, n_rows in lengths_by_chrom.items():
            if name == chrom:
                break
            row_offset += n_rows

        row_end = row_offset + lengths_by_chrom[chrom]
        chrom_starts = h5['meta/start'][row_offset:row_end]

        lo = int(np.searchsorted(chrom_starts, start, 'left'))
        hi = int(np.searchsorted(chrom_starts, stop, 'right'))
        if lo >= hi:
            return None
        return chrom_starts[lo:hi]
91+
5992
6093def read_region_from_h5 (h5file , samples , chrom , start , stop ):
6194 """
@@ -197,14 +230,16 @@ def run_gpdm(params):
197230 nan_threshold = float (params .get ('nan_threshold' , 0.5 )) # drop probes with > 50% missing
198231 annotations = params .get ('annotations' , []) # regulatory domain annotations
199232
200- # Read the HDF5 for all samples from both groups in a single pass
201- # (more efficient than two separate reads)
233+ # Read beta matrix and positions from HDF5
234+ # Note: Query.process_genomic_queries has a bug where it uses chromosome-local
235+ # row indices to slice the dataset instead of absolute row offsets, producing
236+ # wrong data for any chromosome other than the first. read_region_from_h5
237+ # correctly computes abs_left = row_start + left before slicing.
202238 all_samples = group1 + group2
203239 positions , beta_matrix , valid_samples = read_region_from_h5 (
204240 h5file , all_samples , chrom , start , stop
205241 )
206242
207- # Validate that we have enough probes to fit a GP (minimum 3)
208243 if positions is None or len (positions ) < 3 :
209244 return {'error' : f'Too few probes in { chrom } :{ start } -{ stop } (need >= 3)' }
210245
@@ -284,67 +319,51 @@ def run_gpdm(params):
284319 length_scale_bp = int (ann .get ('length_scale_bp' , 1000 )),
285320 )
286321
287- # --- Step 5: Run both GP models ---
288- # method='both' fits NaiveGP and DomainPartitionedGP independently.
289- # Results are stored in analysis.results_naive and analysis.results_annotation.
290- # The annotation-aware model is set as the primary result (analysis.results).
291- analysis .run (method = 'both' )
292-
293- # --- Step 6: Build grid response for the D3 visualization ---
294- # to_dataframe() exports 500-point predictions aligned to a uniform grid.
295- # Column names use the group label strings passed to load_methylation:
296- # pred_group1, std_group1, pred_group2, std_group2
322+ # --- Step 5: Run annotation-aware GP model only ---
323+ # Skips NaiveGP to halve computation time. Results in analysis.results_annotation.
324+ analysis .run (method = 'annotation_aware' )
325+
326+ # --- Step 5b: Write visualization PNG to cache if a path was supplied ---
327+ plot_path = params .get ('plot_path' )
328+ if plot_path :
329+ try :
330+ import os
331+ import matplotlib .pyplot as plt
332+ os .makedirs (os .path .dirname (plot_path ), exist_ok = True )
333+ analysis .plot_results (results = analysis .results_annotation , save_path = plot_path , dark_theme = False )
334+ plt .close ('all' )
335+ except Exception :
336+ pass # non-fatal: analysis result still returned without image
337+
338+ # --- Step 6: Build grid response for the D3 visualization (termdb/gpdm) ---
339+ # termdb/dmr ignores this; termdb/gpdm needs it for all 4 visualization panels.
297340 grid_df = analysis .to_dataframe ()
298341
def safe_list(arr):
    """
    Convert a 1-D numeric sequence (numpy array, pandas Series or list) to a
    plain Python list of floats, replacing NaN and +/-Inf with None so the
    result can be serialized as JSON (None becomes null).

    The finiteness test is done once on the whole array with np.isfinite
    instead of calling np.isnan / np.isinf on every element separately.
    Missing entries given as None are coerced to NaN by the float conversion
    and therefore also come back as None.

    Parameters:
        arr: 1-D sequence of numbers.

    Returns:
        list whose items are either float or None, same length as `arr`.
    """
    values = np.asarray(arr, dtype=float)
    finite_mask = np.isfinite(values)
    return [
        float(v) if is_finite else None
        for v, is_finite in zip(values.tolist(), finite_mask.tolist())
    ]
306344
307- # Extract the four core GP prediction arrays from the DataFrame
308- pred_a = grid_df ['pred_group1' ].values # group A posterior mean
309- std_a = grid_df ['std_group1' ].values # group A posterior std
310- pred_b = grid_df ['pred_group2' ].values # group B posterior mean
311- std_b = grid_df ['std_group2' ].values # group B posterior std
345+ pred_a = grid_df ['pred_group1' ].values
346+ std_a = grid_df ['std_group1' ].values
347+ pred_b = grid_df ['pred_group2' ].values
348+ std_b = grid_df ['std_group2' ].values
312349
313- # Build the grid dict sent to the client.
314- # CI bands are computed as mean ± 1.96*std (approximates 95% credible interval
315- # for visualization purposes — the exact CI is in ci_lower/ci_upper for DMR calling).
316350 grid = {
317- 'positions' : safe_list (grid_df ['position' ]), # genomic x-axis
318- 'group_a_mean' : safe_list ( pred_a ), # group A GP mean line
319- 'group_a_lower' : safe_list (pred_a - 1.96 * std_a ), # group A lower CI band
320- 'group_a_upper' : safe_list (pred_a + 1.96 * std_a ), # group A upper CI band
321- 'group_b_mean' : safe_list ( pred_b ), # group B GP mean line
322- 'group_b_lower' : safe_list (pred_b - 1.96 * std_b ), # group B lower CI band
323- 'group_b_upper' : safe_list (pred_b + 1.96 * std_b ), # group B upper CI band
324- 'difference_mean' : safe_list (grid_df ['diff_mean' ]), # Delta(x) posterior mean
325- 'difference_lower' : safe_list (grid_df ['ci_lower' ]), # Delta(x) 95% CI lower
326- 'difference_upper' : safe_list (grid_df ['ci_upper' ]), # Delta(x) 95% CI upper
327- 'posterior_prob' : safe_list (grid_df ['prob_B_greater' ]),# P(group2 > group1) at each point
351+ 'positions' : safe_list (grid_df ['position' ]),
352+ 'group_a_mean' : safe_list ( pred_a ),
353+ 'group_a_lower' : safe_list (pred_a - 1.96 * std_a ),
354+ 'group_a_upper' : safe_list (pred_a + 1.96 * std_a ),
355+ 'group_b_mean' : safe_list ( pred_b ),
356+ 'group_b_lower' : safe_list (pred_b - 1.96 * std_b ),
357+ 'group_b_upper' : safe_list (pred_b + 1.96 * std_b ),
358+ 'difference_mean' : safe_list (grid_df ['diff_mean' ]),
359+ 'difference_lower' : safe_list (grid_df ['ci_lower' ]),
360+ 'difference_upper' : safe_list (grid_df ['ci_upper' ]),
361+ 'posterior_prob' : safe_list (grid_df ['prob_B_greater' ]),
328362 }
329363
330- # --- Step 7: Serialize naive DMRs ---
331- # These come from NaiveGP: a single global kernel, no annotation priors.
332- # Shown in the client as purple bars on the DMR track for comparison.
333- naive_dmrs = []
334- if analysis .results_naive and analysis .results_naive .dmrs :
335- for d in analysis .results_naive .dmrs :
336- naive_dmrs .append ({
337- 'chr' : chrom ,
338- 'start' : int (d .start ),
339- 'stop' : int (d .end ),
340- 'width' : int (d .width_bp ),
341- 'max_delta_beta' : float (d .max_delta_beta ),
342- 'probability' : float (d .mean_posterior_prob ),
343- })
344-
345- # --- Step 8: Serialize annotation-aware DMRs ---
346- # These come from DomainPartitionedGP: domain-specific priors and kernels.
347- # Shown as orange bars — the primary result shown to the user.
364+ # --- Step 7: Serialize annotation-aware DMRs ---
365+ # max_delta_beta is always positive (absolute peak effect size).
366+ # mean_delta_beta is signed: positive = group B (group2) > group A (group1) = hyper.
348367 annot_dmrs = []
349368 if analysis .results_annotation and analysis .results_annotation .dmrs :
350369 for d in analysis .results_annotation .dmrs :
@@ -354,21 +373,22 @@ def safe_list(arr):
354373 'stop' : int (d .end ),
355374 'width' : int (d .width_bp ),
356375 'max_delta_beta' : float (d .max_delta_beta ),
376+ 'direction' : 'hyper' if d .mean_delta_beta >= 0 else 'hypo' ,
357377 'probability' : float (d .mean_posterior_prob ),
358378 })
359379
360380 return {
361381 'status' : 'ok' ,
362- 'dmrs' : annot_dmrs , # annotation-aware DMRs (primary result)
363- 'naive_dmrs' : naive_dmrs , # naive DMRs (comparison reference)
364- 'grid' : grid , # 500-point posterior predictions for D3
382+ 'dmrs' : annot_dmrs ,
383+ 'naive_dmrs' : [] , # naive model not run; kept for termdb/gpdm client compatibility
384+ 'grid' : grid ,
365385 'metadata' : {
366- 'n_probes' : int (len (positions )), # probes used after filtering
367- 'n_probes_dropped' : n_dropped , # probes dropped by NaN threshold
368- 'n_nan_imputed' : nan_count , # individual NaN values imputed
369- 'n_samples_group1' : n_g1 , # group 1 sample count
370- 'n_samples_group2' : n_g2 , # group 2 sample count
371- 'region' : f'{ chrom } :{ start } -{ stop } ' , # region string for display
386+ 'n_probes' : int (len (positions )),
387+ 'n_probes_dropped' : n_dropped ,
388+ 'n_nan_imputed' : nan_count ,
389+ 'n_samples_group1' : n_g1 ,
390+ 'n_samples_group2' : n_g2 ,
391+ 'region' : f'{ chrom } :{ start } -{ stop } ' ,
372392 }
373393 }
374394
0 commit comments