3030
3131from rnacentral_pipeline .databases import data
3232from rnacentral_pipeline .databases .data import Entry , Exon , SequenceRegion
33+ from rnacentral_pipeline .databases .helpers import publications as pubs
3334from rnacentral_pipeline .databases .helpers import phylogeny as phy
3435from rnacentral_pipeline .rnacentral import lookup
3536
@@ -78,11 +79,34 @@ def handled_phylogeny(species: str) -> int:
7879
7980
def condense_publications(record):
    """Collapse raw PMID values into a list of reference objects.

    Parameters
    ----------
    record : iterable
        Raw PMID values (e.g. a pandas group of the ``PMID`` column).
        NaN cells and values that cannot be coerced to ``int`` are skipped.

    Returns
    -------
    list
        One ``pubs.reference`` object per distinct PMID, in first-seen order.
    """
    references = []
    seen = set()
    for value in record:
        # Skip missing spreadsheet cells before attempting coercion.
        if pd.isna(value):
            continue
        try:
            pmid = int(value)
        except (TypeError, ValueError):
            # Non-numeric junk (free text, malformed IDs) is silently dropped.
            continue
        # Deduplicate while preserving the order PMIDs first appear in.
        if pmid in seen:
            continue
        seen.add(pmid)
        references.append(pubs.reference(pmid))
    return references
96+
97+
def resolve_sheet(db_dir: Path, basename: str) -> Path:
    """Locate a data sheet in *db_dir*, preferring ``.xls`` over ``.tsv``.

    Parameters
    ----------
    db_dir : Path
        Directory containing the exported EVLncRNAs sheets.
    basename : str
        File name without extension (e.g. ``"lncRNA_information"``).

    Returns
    -------
    Path
        The first existing candidate, checked in ``.xls`` then ``.tsv`` order.

    Raises
    ------
    FileNotFoundError
        If neither variant exists in *db_dir*.
    """
    for suffix in (".xls", ".tsv"):
        candidate = db_dir.joinpath(f"{basename}{suffix}")
        if candidate.exists():
            return candidate
    raise FileNotFoundError(
        f"Could not find {basename}.xls or {basename}.tsv in {db_dir}"
    )
104+
105+
def load_table(path: Path) -> pd.DataFrame:
    """Load a sheet as a DataFrame, dispatching on the file extension.

    ``.tsv`` files are read as tab-separated text; anything else is
    handed to :func:`pandas.read_excel` (the ``.xls`` sheets resolved by
    ``resolve_sheet``).
    """
    if path.suffix == ".tsv":
        return pd.read_csv(path, sep="\t")
    return pd.read_excel(path)
86110
87111
88112def split (input_frame : pd .DataFrame ) -> tuple [pd .DataFrame , pd .DataFrame , pd .DataFrame ]:
@@ -96,9 +120,10 @@ def split(input_frame: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame, pd.Dat
96120 subset = "taxid"
97121 )
98122 print ("NCBI missing done" )
99- e_accessions = no_accessions [no_accessions ["Ensembl" ].notna ()]
123+ e_accessions = no_accessions [no_accessions ["Ensembl" ].notna ()]. copy ()
100124 print ("ensembl subset done" )
101- ncbi_accessions = input_frame [input_frame ["NCBI accession" ].notna ()]
125+ no_accessions = no_accessions [no_accessions ["Ensembl" ].isna ()].copy ()
126+ ncbi_accessions = input_frame [input_frame ["NCBI accession" ].notna ()].copy ()
102127 print ("NCBI subset done" )
103128 return (no_accessions , e_accessions , ncbi_accessions )
104129
@@ -199,60 +224,75 @@ def pull_ensembl_data(e_id: str):
199224
200225
def get_db_matches(match_frame_in: pd.DataFrame, db_dump: Path) -> pd.DataFrame:
    """Match records against an RNAcentral gene-name dump by name and taxid.

    Parameters
    ----------
    match_frame_in : pd.DataFrame
        Records with ``external_id`` (primary name), optional comma-separated
        ``Aliases``, an ``ID`` column, and a ``taxid`` column.
    db_dump : Path
        CSV with a header row and three columns read as
        ``urs``, ``taxid``, ``lookup_name``; the name column may hold
        several ``|``-separated names per row.

    Returns
    -------
    pd.DataFrame
        Inner-join matches on (name, taxid), with ``is_exact_match`` True
        where the matched name is the record's primary ``external_id``.
        Rows are sorted per ``ID`` with exact matches first, so a later
        ``drop_duplicates(subset="ID")`` keeps the exact match when present.
    """

    def lookup_names(row):
        # Primary name first, then any comma-separated aliases.
        names = [str(row["external_id"]).strip()]
        aliases = row.get("Aliases")
        if pd.notna(aliases):
            names.extend(a.strip() for a in str(aliases).split(",") if a.strip())
        return names

    match_frame = match_frame_in.copy()
    match_frame["taxid"] = match_frame["taxid"].astype(int)
    match_frame["lookup_name"] = match_frame.apply(lookup_names, axis="columns")
    match_frame = (
        match_frame.explode("lookup_name")
        .replace(to_replace=["None"], value=np.nan)
        .dropna(subset=["lookup_name"])
    )
    # Flag rows where the candidate name is the record's primary name.
    match_frame["is_exact_match"] = (
        match_frame["lookup_name"] == match_frame["external_id"]
    )

    rnc_data = pd.read_csv(db_dump, names=["urs", "taxid", "lookup_name"], header=0)
    # One row per individual name: the dump packs aliases as "a|b|c".
    rnc_data["lookup_name"] = rnc_data["lookup_name"].apply(lambda x: str(x).split("|"))
    rnc_data = (
        rnc_data.explode("lookup_name")
        .replace(to_replace=["", None], value=np.nan)
        .dropna(subset=["lookup_name"])
    )

    matches = match_frame.merge(
        rnc_data,
        on=["lookup_name", "taxid"],
        how="inner",
    )
    # Exact matches first within each ID so downstream dedup prefers them.
    matches.sort_values(["ID", "is_exact_match"], ascending=[True, False], inplace=True)

    return matches
234263
235264
def load_function_data(function_info: Path) -> pd.DataFrame:
    """Aggregate the function sheet's PMIDs into one publication list per ID.

    Parameters
    ----------
    function_info : Path
        The ``function_information`` sheet (``.xls`` or ``.tsv``).

    Returns
    -------
    pd.DataFrame
        Two columns: ``ID`` and ``publications`` (a list of reference
        objects built by ``condense_publications``), one row per ID in
        original sheet order (``sort=False``).
    """
    function_df = load_table(function_info)
    return (
        function_df.groupby("ID", sort=False)["PMID"]
        .apply(condense_publications)
        .reset_index(name="publications")
    )
272+
273+
236274def parse (db_dir : Path , db_dumps : tuple [Path ], db_url : str ) -> None :
237275 """
238- Parses the 3 excel sheets using pandas and joins them into one massive table
239- which is then parsed to produce entries
276+ Parse and join the two EVLncRNAs3 workbooks and build RNAcentral entries.
240277 """
241- lncRNA = db_dir .joinpath ("lncRNA.xlsx" )
242- interaction = db_dir .joinpath ("interaction2.xlsx" )
243- disease = db_dir .joinpath ("disease2.xlsx" )
244-
245- assert lncRNA .exists () and interaction .exists () and disease .exists ()
246-
247- lncRNA_df = pd .read_excel (lncRNA )
248- interaction_df = pd .read_excel (interaction )
249- disease_df = pd .read_excel (disease )
278+ lncRNA = resolve_sheet (db_dir , "lncRNA_information" )
279+ function_info = resolve_sheet (db_dir , "function_information" )
280+
281+ lncRNA_df = load_table (lncRNA )
282+ function_df = load_function_data (function_info )
283+ lncRNA_df .rename (
284+ columns = {
285+ "LncRNA name" : "external_id" ,
286+ "Alias" : "Aliases" ,
287+ },
288+ inplace = True ,
289+ )
250290
251- print ("Loaded 3 sheets..." )
291+ print ("Loaded EVLncRNAs3 sheets..." )
252292
253- lncRNA_df ["taxid" ] = (
254- lncRNA_df [ "Species" ]. apply ( handled_phylogeny ). dropna (). astype ( int )
255- )
293+ lncRNA_df ["taxid" ] = lncRNA_df [ "Species" ]. apply ( handled_phylogeny )
294+ lncRNA_df = lncRNA_df . dropna ( subset = [ "taxid" ]). copy ( )
295+ lncRNA_df [ "taxid" ] = lncRNA_df [ "taxid" ]. astype ( int )
256296
257297 ## Split the data on the presence of accessions for either NCBI or Ensembl
258298 no_accession_frame , ensembl_frame , ncbi_frame = split (lncRNA_df )
@@ -272,12 +312,12 @@ def parse(db_dir: Path, db_dumps: tuple[Path], db_url: str) -> None:
272312 ## Match with RNAcentral based on the gene name
273313 ## This is optionally chunked to save memory -
274314 ## split the lookup file and provide a list on the commandline
275- matched_frame = pd .concat (
276- [get_db_matches (no_accession_frame , dump_chunk ) for dump_chunk in db_dumps ]
315+ matched_chunks = [get_db_matches (no_accession_frame , dump_chunk ) for dump_chunk in db_dumps ]
316+ matched_frame = pd .concat (matched_chunks , ignore_index = True )
317+ matched_frame .drop_duplicates (subset = "ID" , inplace = True )
318+ matched_frame ["urs_taxid" ] = (
319+ matched_frame ["urs" ] + "_" + matched_frame ["taxid" ].astype (str )
277320 )
278- matched_frame ["taxid" ] = matched_frame ["taxid" ].astype (str )
279- matched_frame ["urs_taxid" ] = matched_frame [["urs" , "taxid" ]].agg ("_" .join , axis = 1 )
280- matched_frame .drop_duplicates (subset = "urs_taxid" , inplace = True )
281321
282322 ## Look up the rest of the data for the hits
283323 mapping = lookup .as_mapping (db_url , matched_frame ["urs_taxid" ].values , QUERY )
@@ -289,59 +329,22 @@ def parse(db_dir: Path, db_dumps: tuple[Path], db_url: str) -> None:
289329 lambda x : mapping [x ]["sequence" ]
290330 )
291331
292- ## Build frame with all hits & accessions
293- ## The full frame is then merged with the disease and interaction frames
294- full_frame = pd .concat ([matched_frame , ensembl_frame , ncbi_frame ])
295-
296- full_frame = full_frame .merge (
297- disease_df .drop (
298- columns = ["Name" , "Species" , "Species category" , "exosome" , "structure" ]
299- ),
300- how = "left" ,
301- on = "ID" ,
302- )
303-
304- full_frame = full_frame .merge (
305- interaction_df .drop (columns = ["Name" , "Species" , "Species category" ]),
306- how = "left" ,
307- on = "ID" ,
332+ ## Build frame with all hits & accessions and add aggregated publication data
333+ full_frame = pd .concat ([matched_frame , ensembl_frame , ncbi_frame ], ignore_index = True )
334+ full_frame .drop_duplicates (subset = "ID" , inplace = True )
335+ full_frame = full_frame .merge (function_df , how = "left" , on = "ID" )
336+ full_frame ["publications" ] = full_frame ["publications" ].apply (
337+ lambda refs : refs if isinstance (refs , list ) else []
308338 )
309339
310- ## Try to ensure one entry per URS_taxid
311- full_frame .drop_duplicates (subset = "urs_taxid" , inplace = True )
312-
313340 ## Tidy up and apply some normalisations
314- full_frame ["publications" ] = full_frame .apply (condense_publications , axis = "columns" )
315341 full_frame ["Chain" ] = full_frame ["Chain" ].apply (
316- lambda x : chain_normalisation .get (x , None )
342+ lambda x : chain_normalisation .get (str ( x ). lower () , None ) if pd . notna ( x ) else None
317343 )
318344 full_frame ["so_type" ] = full_frame ["Class" ].apply (
319345 lambda x : type_normalisation .get (x , "SO:0000655" )
320346 )
321347
322- ## Tidy up and rename some columns
323- full_frame .drop (
324- columns = [
325- "Species category" ,
326- "peptide" ,
327- "circRNA" ,
328- "exosome" ,
329- "structure" ,
330- "Disease category" ,
331- "Methods_x" ,
332- "Sample" ,
333- "Expression pattern" ,
334- "Dysfunction type" ,
335- "Description of disease/function" ,
336- "Source" ,
337- "drug Resistance/chemoresistance/stress" ,
338- "PDBlink" ,
339- "Description of interaction" ,
340- "Methods_y" ,
341- ],
342- inplace = True ,
343- )
344-
345348 full_frame .replace ({np .nan : None }, inplace = True )
346349
347350 ## yield entry objects for each row in the frame, these get written directly.
0 commit comments