1414ZERO_HAMMING_DISTANCE_SPANS = "zero_hamming_distance_spans"
1515
1616
17+ def _max_positive_span_length (delta_row : "np.ndarray" ) -> int :
18+ """Return the max contiguous run length where delta span values are > 0."""
19+ import numpy as np
20+
21+ values = np .asarray (delta_row )
22+ if values .ndim != 1 or values .size == 0 :
23+ return 0
24+
25+ positive_mask = values > 0
26+ if not np .any (positive_mask ):
27+ return 0
28+
29+ transitions = np .diff (positive_mask .astype (np .int8 ))
30+ starts = np .flatnonzero (transitions == 1 ) + 1
31+ ends = np .flatnonzero (transitions == - 1 ) + 1
32+
33+ if positive_mask [0 ]:
34+ starts = np .r_ [0 , starts ]
35+ if positive_mask [- 1 ]:
36+ ends = np .r_ [ends , positive_mask .size ]
37+
38+ return int (np .max (ends - starts ))
39+
40+
def _compute_chimeric_by_mod_hamming_distance(
    delta_layer: "np.ndarray",
    span_threshold: int,
) -> "np.ndarray":
    """Mark each row whose longest positive run exceeds ``span_threshold``.

    Every row of ``delta_layer`` is reduced to the length of its longest
    contiguous stretch of values > 0; the row is flagged when that length is
    strictly greater than ``span_threshold``.

    Returns a boolean array with one entry per row.

    Raises ``ValueError`` when ``delta_layer`` is not two-dimensional.
    """
    import numpy as np

    matrix = np.asarray(delta_layer)
    if matrix.ndim != 2:
        raise ValueError("delta_layer must be a 2D array with shape (n_obs, n_vars).")

    n_rows = matrix.shape[0]
    per_row_exceeds = (
        _max_positive_span_length(read_row) > span_threshold for read_row in matrix
    )
    return np.fromiter(per_row_exceeds, dtype=bool, count=n_rows)
56+
57+
1758def _build_top_segments_obs_tuples (
1859 read_df : "pd.DataFrame" ,
1960 obs_names : "pd.Index" ,
@@ -295,7 +336,10 @@ def chimeric_adata_core(
295336 suffix = "_strand_FASTA_base"
296337 variant_seq1_label = seq1_col [: - len (suffix )] if seq1_col .endswith (suffix ) else seq1_col
297338 variant_seq2_label = seq2_col [: - len (suffix )] if seq2_col .endswith (suffix ) else seq2_col
298- logger .info ("Detected variant call layer '%s'; will overlay on span clustermaps." , variant_call_layer_name )
339+ logger .info (
340+ "Detected variant call layer '%s'; will overlay on span clustermaps." ,
341+ variant_call_layer_name ,
342+ )
299343
300344 # ============================================================
301345 # 1) Rolling NN distances + layer clustermaps
@@ -625,7 +669,9 @@ def chimeric_adata_core(
625669 _vc = adata [mask ].layers [variant_call_layer_name ]
626670 _vc = _vc .toarray () if hasattr (_vc , "toarray" ) else np .asarray (_vc )
627671 _variant_call_df = pd .DataFrame (
628- _vc , index = adata [mask ].obs_names .astype (str ), columns = adata .var_names ,
672+ _vc ,
673+ index = adata [mask ].obs_names .astype (str ),
674+ columns = adata .var_names ,
629675 )
630676
631677 subset = subset [:, site_mask ].copy ()
@@ -641,6 +687,9 @@ def chimeric_adata_core(
641687 variant_call_data = _variant_call_df ,
642688 seq1_label = variant_seq1_label ,
643689 seq2_label = variant_seq2_label ,
690+ ref1_marker_color = getattr (cfg , "variant_overlay_seq1_color" , "white" ),
691+ ref2_marker_color = getattr (cfg , "variant_overlay_seq2_color" , "black" ),
692+ variant_marker_size = getattr (cfg , "variant_overlay_marker_size" , 4.0 ),
644693 title = title ,
645694 save_name = out_png ,
646695 )
@@ -1000,7 +1049,9 @@ def chimeric_adata_core(
10001049 _vc = adata [sample_mask ].layers [variant_call_layer_name ]
10011050 _vc = _vc .toarray () if hasattr (_vc , "toarray" ) else np .asarray (_vc )
10021051 _cross_variant_call_df = pd .DataFrame (
1003- _vc , index = adata [sample_mask ].obs_names .astype (str ), columns = adata .var_names ,
1052+ _vc ,
1053+ index = adata [sample_mask ].obs_names .astype (str ),
1054+ columns = adata .var_names ,
10041055 )
10051056
10061057 # --- Plots ---
@@ -1073,6 +1124,15 @@ def chimeric_adata_core(
10731124 variant_call_data = _cross_variant_call_df ,
10741125 seq1_label = variant_seq1_label ,
10751126 seq2_label = variant_seq2_label ,
1127+ ref1_marker_color = getattr (
1128+ cfg , "variant_overlay_seq1_color" , "white"
1129+ ),
1130+ ref2_marker_color = getattr (
1131+ cfg , "variant_overlay_seq2_color" , "black"
1132+ ),
1133+ variant_marker_size = getattr (
1134+ cfg , "variant_overlay_marker_size" , 4.0
1135+ ),
10761136 title = title ,
10771137 save_name = out_png ,
10781138 )
@@ -1146,12 +1206,31 @@ def chimeric_adata_core(
11461206 )
11471207 delta_layer = np .clip (within_layer - cross_layer , 0 , None )
11481208 adata .layers [DELTA_ZERO_HAMMING_DISTANCE_SPANS ] = delta_layer
1209+ threshold = getattr (cfg , "delta_hamming_chimeric_span_threshold" , 200 )
1210+ try :
1211+ threshold = int (threshold )
1212+ except (TypeError , ValueError ):
1213+ logger .warning (
1214+ "Invalid delta_hamming_chimeric_span_threshold=%s; using default 200." ,
1215+ threshold ,
1216+ )
1217+ threshold = 200
1218+ if threshold < 0 :
1219+ logger .warning (
1220+ "delta_hamming_chimeric_span_threshold=%s is negative; clamping to 0." ,
1221+ threshold ,
1222+ )
1223+ threshold = 0
1224+ adata .obs ["chimeric_by_mod_hamming_distance" ] = (
1225+ _compute_chimeric_by_mod_hamming_distance (delta_layer , threshold )
1226+ )
11491227 else :
11501228 logger .warning (
11511229 "Cannot compute delta: missing %s or %s layer." ,
11521230 ZERO_HAMMING_DISTANCE_SPANS ,
11531231 CROSS_SAMPLE_ZERO_HAMMING_DISTANCE_SPANS ,
11541232 )
1233+ adata .obs ["chimeric_by_mod_hamming_distance" ] = False
11551234
11561235 if DELTA_ZERO_HAMMING_DISTANCE_SPANS in adata .layers :
11571236 for reference in references :
@@ -1300,7 +1379,9 @@ def chimeric_adata_core(
13001379 .astype ("category" )
13011380 .cat .categories .tolist ()
13021381 )
1303- references = adata .obs [cfg .reference_column ].astype ("category" ).cat .categories .tolist ()
1382+ references = (
1383+ adata .obs [cfg .reference_column ].astype ("category" ).cat .categories .tolist ()
1384+ )
13041385
13051386 for reference in references :
13061387 ref_mask = adata .obs [cfg .reference_column ] == reference
@@ -1332,9 +1413,7 @@ def chimeric_adata_core(
13321413 safe_sample = str (sample ).replace (os .sep , "_" )
13331414 safe_ref = str (reference ).replace (os .sep , "_" )
13341415 n_reads = int (sample_mask .sum ())
1335- trio_title = (
1336- f"{ sample } { reference } (n={ n_reads } )"
1337- )
1416+ trio_title = f"{ sample } { reference } (n={ n_reads } )"
13381417 out_png = span_trio_dir / f"{ safe_sample } __{ safe_ref } .png"
13391418 try :
13401419 plot_hamming_span_trio (
@@ -1345,6 +1424,15 @@ def chimeric_adata_core(
13451424 variant_call_data = _variant_call_df ,
13461425 seq1_label = variant_seq1_label ,
13471426 seq2_label = variant_seq2_label ,
1427+ ref1_marker_color = getattr (
1428+ cfg , "variant_overlay_seq1_color" , "white"
1429+ ),
1430+ ref2_marker_color = getattr (
1431+ cfg , "variant_overlay_seq2_color" , "black"
1432+ ),
1433+ variant_marker_size = getattr (
1434+ cfg , "variant_overlay_marker_size" , 4.0
1435+ ),
13481436 title = trio_title ,
13491437 save_name = out_png ,
13501438 )
0 commit comments