Skip to content

Commit 83a71aa

Browse files
committed
Update evlncrnas parser for version 3 and add tests
1 parent 250d48d commit 83a71aa

File tree

9 files changed

+192
-99
lines changed

9 files changed

+192
-99
lines changed
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
ID LncRNA name Species Biological processes Cellular component Molecular fucntions Clinical applications Disease Disease category drug Resistance/chemoresistance/stress Mutation Methods Sample Organoid Expression pattern Function type Interaction target Mode of interaction Expression pattern of Interaction target Level of interaction Type of interaction Detailed Pathway PMID Year Description of disease/function/interaction
2+
EL3692 MALAT1 Homo sapiens Disease-relevant functions Gene regulation Disease diagnosis gallbladder cancer Cancer Microarray, knockdown in GBC tissues and cell lines. up-regulated interaction ABI3BP Regulate lncRNA-DNA regulation MALAT1 down-regulates ABI3BP through EZH2 silencing of H3K27 methylation 31174563 2019 Long noncoding RNA MALAT1 potentiates growth and inhibits senescence by antagonizing ABI3BP in gallbladder cancer cells.
3+
EL3692 MALAT1 Homo sapiens Apoptosis Transcription regulation Disease treatment endometriosis Others knockdown, Western blot endometriosis cell lines up-regulated interaction AMPK Regulate down-regulated lncRNA-mRNA regulation MALAT1 represses AMPK 33235630 2021 Therefore, MALAT1 may regulate GC proliferation via AMPK-mTOR-mediated cell apoptosis and autophagy.
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
ID LncRNA name Alias Species Species category Chromosome Start site End site Chain Exon NO. Class Assembly NCBI accession Ensembl Peptide Peptide name Peptide length Peptide sequence CircRNA Exosome Structure PDBlink Phase separation Location ChatGPT Homologous LncRNA
2+
EL3692 MALAT1 HCN, LINC00047, NCRNA00047, NEAT2, PRO2853 Homo sapiens Human 11 65497738 65506516 plus 2 lincRNA GRCh38.p14 NR_002819, NR_144567, NR_144568 ENSG00000251562 1 1 1 4PLX 1 Chromatin, Cytoplasm, Cytosol, Mitochondrion, Nuclear, Nucleus, Nuclear speckle, Nucleoplasm, Exosome, Speckle periphery EL3691;EL3693;EL3694;EL3695

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ dependencies = [
3636
"throttler>=1.2.0",
3737
"nltk>=3.8.1",
3838
"openpyxl>=3.0.10",
39+
"xlrd>=2.0.1",
3940
"pybedtools>=0.12.0",
4041
"psycopg2-binary>=2.9.7",
4142
"polars>=1.14.0",

rnacentral_pipeline/databases/evlncrnas/helpers.py

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
1-
import numpy as np
2-
3-
from rnacentral_pipeline.databases.data import Entry, Exon, SequenceRegion
1+
from rnacentral_pipeline.databases.data import Entry
42
from rnacentral_pipeline.databases.helpers import phylogeny as phy
53

64

@@ -21,13 +19,11 @@ def sequence(record):
2119

2220

2321
def aliases(record):
    """Return the record's external id plus any comma-separated aliases.

    The stringified external id always comes first; alias entries are
    whitespace-stripped and empty fragments are discarded.
    """
    if record["Aliases"] is None:
        return [str(record["external_id"])]

    names = [str(record["external_id"])]
    for candidate in str(record["Aliases"]).split(","):
        cleaned = candidate.strip()
        if cleaned:
            names.append(cleaned)

    return names
3329

@@ -41,10 +37,7 @@ def rna_type(record):
4137

4238

4339
def url(record):
    """Return the EVLncRNAs3 detail-page URL for this record's ID."""
    base = "https://www.sdklab-biophysics-dzu.net/EVLncRNAs3/#/detail?id="
    return base + str(record["ID"])
4841

4942

5043
def description(record):

rnacentral_pipeline/databases/evlncrnas/parser.py

Lines changed: 89 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030

3131
from rnacentral_pipeline.databases import data
3232
from rnacentral_pipeline.databases.data import Entry, Exon, SequenceRegion
33+
from rnacentral_pipeline.databases.helpers import publications as pubs
3334
from rnacentral_pipeline.databases.helpers import phylogeny as phy
3435
from rnacentral_pipeline.rnacentral import lookup
3536

@@ -78,11 +79,34 @@ def handled_phylogeny(species: str) -> int:
7879

7980

8081
def condense_publications(record):
    """Collect unique PMID references from an iterable of raw PMID values.

    Missing values (NaN/None) and anything that cannot be coerced to an
    integer are skipped silently; duplicates are dropped while preserving
    first-seen order. Returns a list of reference objects built via the
    publications helper.
    """
    seen_pmids = set()
    references = []
    for raw in record:
        if pd.isna(raw):
            continue
        try:
            pmid = int(raw)
        except (TypeError, ValueError):
            continue
        if pmid not in seen_pmids:
            seen_pmids.add(pmid)
            references.append(pubs.reference(pmid))
    return references
96+
97+
98+
def resolve_sheet(
    db_dir: Path,
    basename: str,
    suffixes: tuple[str, ...] = (".xls", ".tsv"),
) -> Path:
    """Locate a data sheet in ``db_dir``, trying each suffix in order.

    Parameters
    ----------
    db_dir:
        Directory expected to contain the sheet.
    basename:
        File name without extension, e.g. ``"lncRNA_information"``.
    suffixes:
        Extensions to try, in priority order. The default preserves the
        original .xls-then-.tsv lookup behaviour.

    Returns the first existing candidate path.

    Raises
    ------
    FileNotFoundError
        If none of the candidates exist in ``db_dir``.
    """
    for suffix in suffixes:
        candidate = db_dir.joinpath(f"{basename}{suffix}")
        if candidate.exists():
            return candidate
    # With the default suffixes this message is byte-identical to the
    # original "Could not find <base>.xls or <base>.tsv in <dir>" text.
    tried = " or ".join(f"{basename}{suffix}" for suffix in suffixes)
    raise FileNotFoundError(f"Could not find {tried} in {db_dir}")
104+
105+
106+
def load_table(path: Path) -> pd.DataFrame:
    """Read a sheet into a DataFrame, dispatching on the file extension.

    Tab-separated files are parsed with ``read_csv``; anything else is
    assumed to be an Excel workbook. The suffix comparison is
    case-insensitive so ``.TSV``/``.Tsv`` files are not mistakenly fed to
    ``read_excel`` (the original exact match only accepted lowercase).
    """
    if path.suffix.lower() == ".tsv":
        return pd.read_csv(path, sep="\t")
    return pd.read_excel(path)
86110

87111

88112
def split(input_frame: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
@@ -96,9 +120,10 @@ def split(input_frame: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame, pd.Dat
96120
subset="taxid"
97121
)
98122
print("NCBI missing done")
99-
e_accessions = no_accessions[no_accessions["Ensembl"].notna()]
123+
e_accessions = no_accessions[no_accessions["Ensembl"].notna()].copy()
100124
print("ensembl subset done")
101-
ncbi_accessions = input_frame[input_frame["NCBI accession"].notna()]
125+
no_accessions = no_accessions[no_accessions["Ensembl"].isna()].copy()
126+
ncbi_accessions = input_frame[input_frame["NCBI accession"].notna()].copy()
102127
print("NCBI subset done")
103128
return (no_accessions, e_accessions, ncbi_accessions)
104129

@@ -199,60 +224,75 @@ def pull_ensembl_data(e_id: str):
199224

200225

201226
def get_db_matches(match_frame_in: pd.DataFrame, db_dump: Path) -> pd.DataFrame:
202-
def split_clean_aliases(al):
203-
if al:
204-
return [a.strip() for a in str(al).split(",")]
205-
return np.nan
227+
def lookup_names(row):
228+
names = [str(row["external_id"]).strip()]
229+
aliases = row.get("Aliases")
230+
if pd.notna(aliases):
231+
names.extend(a.strip() for a in str(aliases).split(",") if a.strip())
232+
return names
206233

207234
match_frame = match_frame_in.copy()
208235
match_frame["taxid"] = match_frame["taxid"].astype(int)
209-
210-
match_frame.rename(columns={"Name": "external_id"}, inplace=True)
211-
match_frame["external_id"] = match_frame["external_id"].apply(split_clean_aliases)
236+
match_frame["lookup_name"] = match_frame.apply(lookup_names, axis="columns")
212237
match_frame = (
213-
match_frame.explode("external_id")
238+
match_frame.explode("lookup_name")
214239
.replace(to_replace=["None"], value=np.nan)
215-
.dropna(subset="external_id")
240+
.dropna(subset="lookup_name")
241+
)
242+
match_frame["is_exact_match"] = (
243+
match_frame["lookup_name"] == match_frame["external_id"]
216244
)
217245

218-
rnc_data = pd.read_csv(db_dump, names=["urs", "taxid", "external_id"], header=0)
219-
rnc_data["external_id"] = rnc_data["external_id"].apply(lambda x: str(x).split("|"))
246+
rnc_data = pd.read_csv(db_dump, names=["urs", "taxid", "lookup_name"], header=0)
247+
rnc_data["lookup_name"] = rnc_data["lookup_name"].apply(lambda x: str(x).split("|"))
220248
rnc_data = (
221-
rnc_data.explode("external_id")
249+
rnc_data.explode("lookup_name")
222250
.replace(to_replace=["", None], value=np.nan)
223-
.dropna(subset="external_id")
251+
.dropna(subset="lookup_name")
224252
)
225253

226254
matches = match_frame.merge(
227255
rnc_data,
228-
left_on=["external_id", "taxid"],
229-
right_on=["external_id", "taxid"],
256+
left_on=["lookup_name", "taxid"],
257+
right_on=["lookup_name", "taxid"],
230258
how="inner",
231259
)
260+
matches.sort_values(["ID", "is_exact_match"], ascending=[True, False], inplace=True)
232261

233262
return matches
234263

235264

265+
def load_function_data(function_info: Path) -> pd.DataFrame:
    """Aggregate per-lncRNA publication references from the function sheet.

    Groups the sheet's rows by lncRNA ``ID`` (keeping the sheet's original
    order) and condenses each group's PMID values into a list of reference
    objects, returned as a ``publications`` column alongside ``ID``.
    """
    table = load_table(function_info)
    pmids_by_id = table.groupby("ID", sort=False)["PMID"]
    publications = pmids_by_id.apply(condense_publications)
    return publications.reset_index(name="publications")
272+
273+
236274
def parse(db_dir: Path, db_dumps: tuple[Path], db_url: str) -> None:
237275
"""
238-
Parses the 3 excel sheets using pandas and joins them into one massive table
239-
which is then parsed to produce entries
276+
Parse and join the two EVLncRNAs3 workbooks and build RNAcentral entries.
240277
"""
241-
lncRNA = db_dir.joinpath("lncRNA.xlsx")
242-
interaction = db_dir.joinpath("interaction2.xlsx")
243-
disease = db_dir.joinpath("disease2.xlsx")
244-
245-
assert lncRNA.exists() and interaction.exists() and disease.exists()
246-
247-
lncRNA_df = pd.read_excel(lncRNA)
248-
interaction_df = pd.read_excel(interaction)
249-
disease_df = pd.read_excel(disease)
278+
lncRNA = resolve_sheet(db_dir, "lncRNA_information")
279+
function_info = resolve_sheet(db_dir, "function_information")
280+
281+
lncRNA_df = load_table(lncRNA)
282+
function_df = load_function_data(function_info)
283+
lncRNA_df.rename(
284+
columns={
285+
"LncRNA name": "external_id",
286+
"Alias": "Aliases",
287+
},
288+
inplace=True,
289+
)
250290

251-
print("Loaded 3 sheets...")
291+
print("Loaded EVLncRNAs3 sheets...")
252292

253-
lncRNA_df["taxid"] = (
254-
lncRNA_df["Species"].apply(handled_phylogeny).dropna().astype(int)
255-
)
293+
lncRNA_df["taxid"] = lncRNA_df["Species"].apply(handled_phylogeny)
294+
lncRNA_df = lncRNA_df.dropna(subset=["taxid"]).copy()
295+
lncRNA_df["taxid"] = lncRNA_df["taxid"].astype(int)
256296

257297
## Split the data on the presence of accessions for either NCBI or Ensembl
258298
no_accession_frame, ensembl_frame, ncbi_frame = split(lncRNA_df)
@@ -272,12 +312,12 @@ def parse(db_dir: Path, db_dumps: tuple[Path], db_url: str) -> None:
272312
## Match with RNAcentral based on the gene name
273313
## This is optionally chunked to save memory -
274314
## split the lookup file and provide a list on the commandline
275-
matched_frame = pd.concat(
276-
[get_db_matches(no_accession_frame, dump_chunk) for dump_chunk in db_dumps]
315+
matched_chunks = [get_db_matches(no_accession_frame, dump_chunk) for dump_chunk in db_dumps]
316+
matched_frame = pd.concat(matched_chunks, ignore_index=True)
317+
matched_frame.drop_duplicates(subset="ID", inplace=True)
318+
matched_frame["urs_taxid"] = (
319+
matched_frame["urs"] + "_" + matched_frame["taxid"].astype(str)
277320
)
278-
matched_frame["taxid"] = matched_frame["taxid"].astype(str)
279-
matched_frame["urs_taxid"] = matched_frame[["urs", "taxid"]].agg("_".join, axis=1)
280-
matched_frame.drop_duplicates(subset="urs_taxid", inplace=True)
281321

282322
## Look up the rest of the data for the hits
283323
mapping = lookup.as_mapping(db_url, matched_frame["urs_taxid"].values, QUERY)
@@ -289,59 +329,22 @@ def parse(db_dir: Path, db_dumps: tuple[Path], db_url: str) -> None:
289329
lambda x: mapping[x]["sequence"]
290330
)
291331

292-
## Build frame with all hits & accessions
293-
## The full frame is then merged with the disease and interaction frames
294-
full_frame = pd.concat([matched_frame, ensembl_frame, ncbi_frame])
295-
296-
full_frame = full_frame.merge(
297-
disease_df.drop(
298-
columns=["Name", "Species", "Species category", "exosome", "structure"]
299-
),
300-
how="left",
301-
on="ID",
302-
)
303-
304-
full_frame = full_frame.merge(
305-
interaction_df.drop(columns=["Name", "Species", "Species category"]),
306-
how="left",
307-
on="ID",
332+
## Build frame with all hits & accessions and add aggregated publication data
333+
full_frame = pd.concat([matched_frame, ensembl_frame, ncbi_frame], ignore_index=True)
334+
full_frame.drop_duplicates(subset="ID", inplace=True)
335+
full_frame = full_frame.merge(function_df, how="left", on="ID")
336+
full_frame["publications"] = full_frame["publications"].apply(
337+
lambda refs: refs if isinstance(refs, list) else []
308338
)
309339

310-
## Try to ensure one entry per URS_taxid
311-
full_frame.drop_duplicates(subset="urs_taxid", inplace=True)
312-
313340
## Tidy up and apply some normalisations
314-
full_frame["publications"] = full_frame.apply(condense_publications, axis="columns")
315341
full_frame["Chain"] = full_frame["Chain"].apply(
316-
lambda x: chain_normalisation.get(x, None)
342+
lambda x: chain_normalisation.get(str(x).lower(), None) if pd.notna(x) else None
317343
)
318344
full_frame["so_type"] = full_frame["Class"].apply(
319345
lambda x: type_normalisation.get(x, "SO:0000655")
320346
)
321347

322-
## Tidy up and rename some columns
323-
full_frame.drop(
324-
columns=[
325-
"Species category",
326-
"peptide",
327-
"circRNA",
328-
"exosome",
329-
"structure",
330-
"Disease category",
331-
"Methods_x",
332-
"Sample",
333-
"Expression pattern",
334-
"Dysfunction type",
335-
"Description of disease/function",
336-
"Source",
337-
"drug Resistance/chemoresistance/stress",
338-
"PDBlink",
339-
"Description of interaction",
340-
"Methods_y",
341-
],
342-
inplace=True,
343-
)
344-
345348
full_frame.replace({np.nan: None}, inplace=True)
346349

347350
## yield entry objects for each row in the frame, these get written directly.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# -*- coding: utf-8 -*-
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
# -*- coding: utf-8 -*-
2+
3+
import json
4+
from pathlib import Path
5+
6+
import pandas as pd
7+
8+
from rnacentral_pipeline.databases.evlncrnas import helpers, parser
9+
10+
11+
def test_can_parse_evlncrnas3_fixture(monkeypatch, tmp_path):
    """End-to-end parse of the EVLncRNAs3 fixture sheets.

    External services (phylogeny lookup, Ensembl/NCBI enrichment, the
    RNAcentral db-dump matching and mapping lookup, lineage) are stubbed
    out, then the single expected MALAT1 entry is checked field by field.
    """
    fixture_dir = Path("data/evlncrnas")

    species_taxids = {"Homo sapiens": 9606}
    monkeypatch.setattr(parser, "handled_phylogeny", species_taxids.get)

    def fake_enrich(frame):
        # Pretend every row was resolved: attach fixed sequence/region
        # metadata and report an empty "missing" frame as the second value.
        found = frame.copy()
        found["sequence"] = "ACGTACGTACGT"
        found["assembly_id"] = "GRCh38"
        found["chromosome"] = "11"
        found["region_start"] = 65497688
        found["region_stop"] = 65506516
        found["chain"] = "+"
        return found, found.iloc[0:0].copy()

    match_columns = [
        "ID",
        "external_id",
        "lookup_name",
        "urs",
        "taxid",
        "is_exact_match",
    ]

    monkeypatch.setattr(parser, "get_ensembl_accessions", fake_enrich)
    monkeypatch.setattr(parser, "get_ncbi_accessions", fake_enrich)
    monkeypatch.setattr(
        parser,
        "get_db_matches",
        lambda _frame, _dump: pd.DataFrame(columns=match_columns),
    )
    monkeypatch.setattr(parser.lookup, "as_mapping", lambda *_args, **_kwargs: {})
    monkeypatch.setattr(helpers, "lineage", lambda _record: "Eukaryota; Metazoa; Mammalia")

    dump = tmp_path / "ev_lookup.csv"
    dump.write_text("urs,taxid,external_id\n", encoding="utf-8")

    entries = list(parser.parse(fixture_dir, (dump,), "postgres://ignored"))
    assert len(entries) == 1

    entry = entries[0]
    assert entry.primary_id == "EVLNCRNAS:EL3692"
    assert entry.accession == "EVLNCRNAS:EL3692"
    assert entry.ncbi_tax_id == 9606
    assert entry.database == "EVLNCRNAS"
    assert entry.sequence == "ACGTACGTACGT"
    assert entry.rna_type == "SO:0001463"
    assert entry.url == "https://www.sdklab-biophysics-dzu.net/EVLncRNAs3/#/detail?id=EL3692"
    assert entry.gene_synonyms == [
        "MALAT1",
        "HCN",
        "LINC00047",
        "NCRNA00047",
        "NEAT2",
        "PRO2853",
    ]
    assert sorted(ref.external_id for ref in entry.references) == ["31174563", "33235630"]
    assert entry.note_data == {}
    assert json.loads(entry.note) == {
        "url": "https://www.sdklab-biophysics-dzu.net/EVLncRNAs3/#/detail?id=EL3692"
    }

0 commit comments

Comments
 (0)