@@ -888,14 +888,24 @@ def _gene_cnv(
888
888
chunks ,
889
889
inline_array ,
890
890
):
891
- debug = self ._log .debug
892
-
893
- debug ("sanity check" )
891
+ # Sanity check.
894
892
assert isinstance (region , Region )
895
893
896
- debug ("access HMM data" )
894
+ # Access genes within the region of interest.
895
+ df_genome_features = self .genome_features (region = region )
896
+ sample_query_options = sample_query_options or {}
897
+ df_genes = df_genome_features .query (
898
+ f"type == '{ self ._gff_gene_type } '" , ** sample_query_options
899
+ )
900
+
901
+ # Refine the region for CNV data to ensure coverage of all requested genes.
902
+ cnv_region = Region (
903
+ region .contig , df_genes ["start" ].min (), df_genes ["end" ].max ()
904
+ )
905
+
906
+ # Access HMM data.
897
907
ds_hmm = self .cnv_hmm (
898
- region = region . contig ,
908
+ region = cnv_region ,
899
909
sample_sets = sample_sets ,
900
910
sample_query = sample_query ,
901
911
sample_query_options = sample_query_options ,
@@ -909,45 +919,38 @@ def _gene_cnv(
909
919
with self ._dask_progress (desc = "Load CNV HMM data" ):
910
920
pos , end , cn = dask .compute (pos , end , cn )
911
921
912
- debug ("access genes" )
913
- df_genome_features = self .genome_features (region = region )
914
- sample_query_options = sample_query_options or {}
915
- df_genes = df_genome_features .query (
916
- f"type == '{ self ._gff_gene_type } '" , ** sample_query_options
917
- )
918
-
919
- debug ("setup intermediates" )
922
+ # Set up intermediates.
920
923
windows = []
921
924
modes = []
922
925
counts = []
923
926
924
- debug ( "iterate over genes" )
927
+ # Iterate over genes.
925
928
genes_iterator = self ._progress (
926
929
df_genes .itertuples (),
927
930
desc = "Compute modal gene copy number" ,
928
931
total = len (df_genes ),
929
932
)
930
933
for gene in genes_iterator :
931
- # locate windows overlapping the gene
934
+ # Locate windows overlapping the gene.
932
935
loc_gene_start = bisect_left (end , gene .start )
933
936
loc_gene_stop = bisect_right (pos , gene .end )
934
937
w = loc_gene_stop - loc_gene_start
935
938
windows .append (w )
936
939
937
- # slice out copy number data for the given gene
940
+ # Slice out copy number data for the given gene.
938
941
cn_gene = cn [loc_gene_start :loc_gene_stop ]
939
942
940
- # compute the modes
943
+ # Compute the modes.
941
944
m , c = _cn_mode (cn_gene , vmax = 12 )
942
945
modes .append (m )
943
946
counts .append (c )
944
947
945
- debug ( "combine results" )
948
+ # Combine results.
946
949
windows = np .array (windows )
947
950
modes = np .vstack (modes )
948
951
counts = np .vstack (counts )
949
952
950
- debug ( "build dataset" )
953
+ # Build dataset.
951
954
ds_out = xr .Dataset (
952
955
coords = {
953
956
"gene_id" : (["genes" ], df_genes ["ID" ].values ),
@@ -1182,6 +1185,11 @@ def _gene_cnv_frequencies(
1182
1185
1183
1186
freq_cols [f"frq_{ coh } " ] = np .concatenate ([amp_freq_coh , del_freq_coh ])
1184
1187
1188
+ if len (coh_dict ) == 0 :
1189
+ raise ValueError (
1190
+ "No cohorts available for the given sample selection parameters and minimum cohort size."
1191
+ )
1192
+
1185
1193
debug ("build a dataframe with the frequency columns" )
1186
1194
df_freqs = pd .DataFrame (freq_cols )
1187
1195
0 commit comments