3030
3131from rnacentral_pipeline .databases import data
3232from rnacentral_pipeline .databases .data import Entry , Exon , SequenceRegion
33+ from rnacentral_pipeline .databases .helpers import publications as pubs
3334from rnacentral_pipeline .databases .helpers import phylogeny as phy
3435from rnacentral_pipeline .rnacentral import lookup
3536
@@ -78,11 +79,34 @@ def handled_phylogeny(species: str) -> int:
7879
7980
def condense_publications(record):
    """Collapse raw PMID values into a list of reference objects.

    Parameters
    ----------
    record : iterable
        Raw PMID values (e.g. a pandas group of the ``PMID`` column).
        NaN cells and values that cannot be coerced to ``int`` are skipped.

    Returns
    -------
    list
        One ``pubs.reference`` object per distinct PMID, in first-seen order.
    """
    references = []
    seen = set()
    for value in record:
        # Skip missing spreadsheet cells before attempting coercion.
        if pd.isna(value):
            continue
        try:
            pmid = int(value)
        except (TypeError, ValueError):
            # Non-numeric junk (free text, malformed IDs) is silently dropped.
            continue
        # Deduplicate while preserving the order PMIDs first appear in.
        if pmid in seen:
            continue
        seen.add(pmid)
        references.append(pubs.reference(pmid))
    return references
96+
97+
def resolve_sheet(db_dir: Path, basename: str) -> Path:
    """Locate a data sheet in *db_dir*, preferring ``.xls`` over ``.tsv``.

    Parameters
    ----------
    db_dir : Path
        Directory containing the exported EVLncRNAs sheets.
    basename : str
        File name without extension (e.g. ``"lncRNA_information"``).

    Returns
    -------
    Path
        The first existing candidate, checked in ``.xls`` then ``.tsv`` order.

    Raises
    ------
    FileNotFoundError
        If neither variant exists in *db_dir*.
    """
    for suffix in (".xls", ".tsv"):
        candidate = db_dir.joinpath(f"{basename}{suffix}")
        if candidate.exists():
            return candidate
    raise FileNotFoundError(
        f"Could not find {basename}.xls or {basename}.tsv in {db_dir}"
    )
104+
105+
def load_table(path: Path) -> pd.DataFrame:
    """Load a sheet as a DataFrame, dispatching on the file extension.

    ``.tsv`` files are read as tab-separated text; anything else is
    handed to :func:`pandas.read_excel` (the ``.xls`` sheets resolved by
    ``resolve_sheet``).
    """
    if path.suffix == ".tsv":
        return pd.read_csv(path, sep="\t")
    return pd.read_excel(path)
86110
87111
88112def split (input_frame : pd .DataFrame ) -> tuple [pd .DataFrame , pd .DataFrame , pd .DataFrame ]:
@@ -96,9 +120,10 @@ def split(input_frame: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame, pd.Dat
96120 subset = "taxid"
97121 )
98122 print ("NCBI missing done" )
99- e_accessions = no_accessions [no_accessions ["Ensembl" ].notna ()]
123+ e_accessions = no_accessions [no_accessions ["Ensembl" ].notna ()]. copy ()
100124 print ("ensembl subset done" )
101- ncbi_accessions = input_frame [input_frame ["NCBI accession" ].notna ()]
125+ no_accessions = no_accessions [no_accessions ["Ensembl" ].isna ()].copy ()
126+ ncbi_accessions = input_frame [input_frame ["NCBI accession" ].notna ()].copy ()
102127 print ("NCBI subset done" )
103128 return (no_accessions , e_accessions , ncbi_accessions )
104129
@@ -199,60 +224,75 @@ def pull_ensembl_data(e_id: str):
199224
200225
def get_db_matches(match_frame_in: pd.DataFrame, db_dump: Path) -> pd.DataFrame:
    """Match records against an RNAcentral gene-name dump by name and taxid.

    Parameters
    ----------
    match_frame_in : pd.DataFrame
        Records with ``external_id`` (primary name), optional comma-separated
        ``Aliases``, an ``ID`` column, and a ``taxid`` column.
    db_dump : Path
        CSV with a header row and three columns read as
        ``urs``, ``taxid``, ``lookup_name``; the name column may hold
        several ``|``-separated names per row.

    Returns
    -------
    pd.DataFrame
        Inner-join matches on (name, taxid), with ``is_exact_match`` True
        where the matched name is the record's primary ``external_id``.
        Rows are sorted per ``ID`` with exact matches first, so a later
        ``drop_duplicates(subset="ID")`` keeps the exact match when present.
    """

    def lookup_names(row):
        # Primary name first, then any comma-separated aliases.
        names = [str(row["external_id"]).strip()]
        aliases = row.get("Aliases")
        if pd.notna(aliases):
            names.extend(a.strip() for a in str(aliases).split(",") if a.strip())
        return names

    match_frame = match_frame_in.copy()
    match_frame["taxid"] = match_frame["taxid"].astype(int)
    match_frame["lookup_name"] = match_frame.apply(lookup_names, axis="columns")
    match_frame = (
        match_frame.explode("lookup_name")
        .replace(to_replace=["None"], value=np.nan)
        .dropna(subset=["lookup_name"])
    )
    # Flag rows where the candidate name is the record's primary name.
    match_frame["is_exact_match"] = (
        match_frame["lookup_name"] == match_frame["external_id"]
    )

    rnc_data = pd.read_csv(db_dump, names=["urs", "taxid", "lookup_name"], header=0)
    # One row per individual name: the dump packs aliases as "a|b|c".
    rnc_data["lookup_name"] = rnc_data["lookup_name"].apply(lambda x: str(x).split("|"))
    rnc_data = (
        rnc_data.explode("lookup_name")
        .replace(to_replace=["", None], value=np.nan)
        .dropna(subset=["lookup_name"])
    )

    matches = match_frame.merge(
        rnc_data,
        on=["lookup_name", "taxid"],
        how="inner",
    )
    # Exact matches first within each ID so downstream dedup prefers them.
    matches.sort_values(["ID", "is_exact_match"], ascending=[True, False], inplace=True)

    return matches
234263
235264
def load_function_data(function_info: Path) -> pd.DataFrame:
    """Aggregate the function sheet's PMIDs into one publication list per ID.

    Parameters
    ----------
    function_info : Path
        The ``function_information`` sheet (``.xls`` or ``.tsv``).

    Returns
    -------
    pd.DataFrame
        Two columns: ``ID`` and ``publications`` (a list of reference
        objects built by ``condense_publications``), one row per ID in
        original sheet order (``sort=False``).
    """
    function_df = load_table(function_info)
    return (
        function_df.groupby("ID", sort=False)["PMID"]
        .apply(condense_publications)
        .reset_index(name="publications")
    )
272+
273+
236274def parse (db_dir : Path , db_dumps : tuple [Path ], db_url : str ) -> None :
237275 """
238- Parses the 3 excel sheets using pandas and joins them into one massive table
239- which is then parsed to produce entries
276+ Parse and join the two EVLncRNAs3 workbooks and build RNAcentral entries.
240277 """
241- lncRNA = db_dir .joinpath ("lncRNA.xlsx" )
242- interaction = db_dir .joinpath ("interaction2.xlsx" )
243- disease = db_dir .joinpath ("disease2.xlsx" )
244-
245- assert lncRNA .exists () and interaction .exists () and disease .exists ()
246-
247- lncRNA_df = pd .read_excel (lncRNA )
248- interaction_df = pd .read_excel (interaction )
249- disease_df = pd .read_excel (disease )
278+ lncRNA = resolve_sheet (db_dir , "lncRNA_information" )
279+ function_info = resolve_sheet (db_dir , "function_information" )
280+
281+ lncRNA_df = load_table (lncRNA )
282+ function_df = load_function_data (function_info )
283+ lncRNA_df .rename (
284+ columns = {
285+ "LncRNA name" : "external_id" ,
286+ "Alias" : "Aliases" ,
287+ },
288+ inplace = True ,
289+ )
250290
251- print ("Loaded 3 sheets..." )
291+ print ("Loaded EVLncRNAs3 sheets..." )
252292
253- lncRNA_df ["taxid" ] = (
254- lncRNA_df [ "Species" ]. apply ( handled_phylogeny ). dropna (). astype ( int )
255- )
293+ lncRNA_df ["taxid" ] = lncRNA_df [ "Species" ]. apply ( handled_phylogeny )
294+ lncRNA_df = lncRNA_df . dropna ( subset = [ "taxid" ]). copy ( )
295+ lncRNA_df [ "taxid" ] = lncRNA_df [ "taxid" ]. astype ( int )
256296
257297 ## Split the data on the presence of accessions for either NCBI or Ensembl
258298 no_accession_frame , ensembl_frame , ncbi_frame = split (lncRNA_df )
@@ -272,12 +312,12 @@ def parse(db_dir: Path, db_dumps: tuple[Path], db_url: str) -> None:
272312 ## Match with RNAcentral based on the gene name
273313 ## This is optionally chunked to save memory -
274314 ## split the lookup file and provide a list on the commandline
275- matched_frame = pd .concat (
276- [get_db_matches (no_accession_frame , dump_chunk ) for dump_chunk in db_dumps ]
315+ matched_chunks = [get_db_matches (no_accession_frame , dump_chunk ) for dump_chunk in db_dumps ]
316+ matched_frame = pd .concat (matched_chunks , ignore_index = True )
317+ matched_frame .drop_duplicates (subset = "ID" , inplace = True )
318+ matched_frame ["urs_taxid" ] = (
319+ matched_frame ["urs" ] + "_" + matched_frame ["taxid" ].astype (str )
277320 )
278- matched_frame ["taxid" ] = matched_frame ["taxid" ].astype (str )
279- matched_frame ["urs_taxid" ] = matched_frame [["urs" , "taxid" ]].agg ("_" .join , axis = 1 )
280- matched_frame .drop_duplicates (subset = "urs_taxid" , inplace = True )
281321
282322 ## Look up the rest of the data for the hits
283323 mapping = lookup .as_mapping (db_url , matched_frame ["urs_taxid" ].values , QUERY )
@@ -289,59 +329,22 @@ def parse(db_dir: Path, db_dumps: tuple[Path], db_url: str) -> None:
289329 lambda x : mapping [x ]["sequence" ]
290330 )
291331
292- ## Build frame with all hits & accessions
293- ## The full frame is then merged with the disease and interaction frames
294- full_frame = pd .concat ([matched_frame , ensembl_frame , ncbi_frame ])
295-
296- full_frame = full_frame .merge (
297- disease_df .drop (
298- columns = ["Name" , "Species" , "Species category" , "exosome" , "structure" ]
299- ),
300- how = "left" ,
301- on = "ID" ,
302- )
303-
304- full_frame = full_frame .merge (
305- interaction_df .drop (columns = ["Name" , "Species" , "Species category" ]),
306- how = "left" ,
307- on = "ID" ,
332+ ## Build frame with all hits & accessions and add aggregated publication data
333+ full_frame = pd .concat ([matched_frame , ensembl_frame , ncbi_frame ], ignore_index = True )
334+ full_frame .drop_duplicates (subset = "ID" , inplace = True )
335+ full_frame = full_frame .merge (function_df , how = "left" , on = "ID" )
336+ full_frame ["publications" ] = full_frame ["publications" ].apply (
337+ lambda refs : refs if isinstance (refs , list ) else []
308338 )
309339
310- ## Try to ensure one entry per URS_taxid
311- full_frame .drop_duplicates (subset = "urs_taxid" , inplace = True )
312-
313340 ## Tidy up and apply some normalisations
314- full_frame ["publications" ] = full_frame .apply (condense_publications , axis = "columns" )
315341 full_frame ["Chain" ] = full_frame ["Chain" ].apply (
316- lambda x : chain_normalisation .get (x , None )
342+ lambda x : chain_normalisation .get (str ( x ). lower () , None ) if pd . notna ( x ) else None
317343 )
318344 full_frame ["so_type" ] = full_frame ["Class" ].apply (
319345 lambda x : type_normalisation .get (x , "SO:0000655" )
320346 )
321347
322- ## Tidy up and rename some columns
323- full_frame .drop (
324- columns = [
325- "Species category" ,
326- "peptide" ,
327- "circRNA" ,
328- "exosome" ,
329- "structure" ,
330- "Disease category" ,
331- "Methods_x" ,
332- "Sample" ,
333- "Expression pattern" ,
334- "Dysfunction type" ,
335- "Description of disease/function" ,
336- "Source" ,
337- "drug Resistance/chemoresistance/stress" ,
338- "PDBlink" ,
339- "Description of interaction" ,
340- "Methods_y" ,
341- ],
342- inplace = True ,
343- )
344-
345348 full_frame .replace ({np .nan : None }, inplace = True )
346349
347350 ## yield entry objects for each row in the frame, these get written directly.
0 commit comments