@@ -117,26 +117,58 @@ def get_genomic_mane_genes(
117117 :param end: Genomic end position. Assumes residue coordinates.
118118 :return: Unique MANE gene(s) found for a genomic location
119119 """
120+ # Only interested in rows where genomic location lives
120121 mane_rows = self .df .filter (
121122 (start >= pl .col ("chr_start" ))
122123 & (end <= pl .col ("chr_end" ))
123124 & (pl .col ("GRCh38_chr" ) == ac )
124- ). unique ( subset = [ "#NCBI_GeneID" ])
125+ )
125126
126- if len ( mane_rows ) == 0 :
127+ if mane_rows . is_empty () :
127128 return []
128129
129- mane_rows = mane_rows .with_columns (
130- pl .col ("#NCBI_GeneID" )
131- .str .split_exact (":" , 1 )
132- .struct .field ("field_1" )
133- .cast (pl .Int32 )
134- .alias ("ncbi_gene_id" ),
135- pl .col ("HGNC_ID" )
136- .str .split_exact (":" , 1 )
137- .struct .field ("field_1" )
138- .cast (pl .Int32 )
139- .alias ("hgnc_id" ),
130+ # Group rows by NCBI gene ID and reduce each group to the fields we return;
131+ # the distinct, normalized MANE statuses become a list sorted in DESC order
132+ mane_rows = mane_rows .group_by ("#NCBI_GeneID" ).agg (
133+ [
134+ pl .col ("#NCBI_GeneID" )
135+ .first ()
136+ .str .split_exact (":" , 1 )
137+ .struct .field ("field_1" )
138+ .cast (pl .Int32 )
139+ .alias ("ncbi_gene_id" ),
140+ pl .col ("HGNC_ID" )
141+ .first ()
142+ .str .split_exact (":" , 1 )
143+ .struct .field ("field_1" )
144+ .cast (pl .Int32 )
145+ .alias ("hgnc_id" ),
146+ pl .col ("MANE_status" )
147+ .unique ()
148+ .str .to_lowercase ()
149+ .str .replace_all (" " , "_" )
150+ .alias ("status" )
151+ .sort (descending = True ),
152+ pl .col ("symbol" ).first (),
153+ ]
154+ )
155+
156+ # Sort final rows based on MANE status:
157+ # First by number of statuses, DESC (gene with both select and plus clinical first)
158+ # Then by the joined status string in DESC order
159+ # Then by NCBI gene ID in ASC order
160+ mane_rows = (
161+ mane_rows .with_columns (
162+ [
163+ pl .col ("status" ).list .len ().alias ("status_count" ),
164+ pl .col ("status" ).list .join ("_" ).alias ("status_str" ),
165+ pl .col ("ncbi_gene_id" ),
166+ ]
167+ )
168+ .sort (
169+ ["status_count" , "status_str" , "ncbi_gene_id" ],
170+ descending = [True , True , False ],
171+ )
172+ .drop (["status_count" , "status_str" , "#NCBI_GeneID" ])
140173 )
141- mane_rows = mane_rows .select (["ncbi_gene_id" , "hgnc_id" , "symbol" ])
142174 return [ManeGeneData (** mane_gene ) for mane_gene in mane_rows .to_dicts ()]
0 commit comments