Skip to content

Commit f179b3f

Browse files
authored
Curie detection patch (#213)
* https => http * test if CURIE is legit * Create omim.json * clipped omim file for test * injected prefix_map * anchored bandit to avoid prev GH action error * added omim to prefix_map in test * added 's' * #214 check if NOT an empty list * fixes #214 check if NOT an empty list * fixes sssom parse creates empty columns #214 * reformatted * fixes #214 * cleanup * continue instead of break * formatting * fixed error: str obj has no attr 'write' * formatted * passes tests * rolled back prev edit * added some code to generate ptables for diseaseMap * exploded list for row instead of printing a list * formatted * reverted to simple solution * reverted to simple solution * updated test file for another test * Draft1 for prefix_reconciliation addresses #216 * cleanup Prefix reconciliation rules #216 * missed final assignments * test for Prefix reconciliation rules #216 * flake8 and mypy compliant * reformated * changed logic:dict passed rather than yaml path * type correction * fixed prefix_recon * format * implemented use of SchemaView over with open * added prefix reconciliation to merge CLI * added constats for SchemaView * changed param * added dynamic value assignment * reshuffled reconcile_prefixes * flake8 compliant
1 parent bf9c32b commit f179b3f

14 files changed

+353
-6
lines changed

sssom/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,5 @@
99
filter_redundant_rows,
1010
group_mappings,
1111
parse,
12+
reconcile_prefix_and_data,
1213
)

sssom/cli.py

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
dataframe_to_ptable,
3939
filter_redundant_rows,
4040
merge_msdf,
41+
reconcile_prefix_and_data,
4142
remove_unmatched,
4243
to_mapping_set_dataframe,
4344
)
@@ -192,7 +193,7 @@ def ptable(input, output: TextIO, inverse_factor):
192193
# , priors=list(priors)
193194
rows = dataframe_to_ptable(df)
194195
for row in rows:
195-
print(row, sep="\t", file=output)
196+
print(*row, sep="\t", file=output)
196197

197198

198199
@main.command()
@@ -434,7 +435,6 @@ def merge(inputs: str, output: TextIO, reconcile: bool = True):
434435
""" # noqa: DAR101
435436
msdfs = [read_sssom_table(i) for i in inputs]
436437
merged_msdf = merge_msdf(*msdfs, reconcile=reconcile)
437-
# Export MappingSetDataFrame into a TSV
438438
write_table(merged_msdf, output)
439439

440440

@@ -472,5 +472,28 @@ def rewire(
472472
print(rdfstr, file=output)
473473

474474

475+
@main.command()
@input_argument
@click.option(
    "-p",
    "--reconcile-prefix-file",
    required=True,
    type=click.Path(exists=True, dir_okay=False),
    help="Provide YAML file with prefix reconciliation information.",
)
@output_option
def reconcile_prefixes(input: str, reconcile_prefix_file: str, output: TextIO):
    """
    Reconcile prefix_map based on provided YAML file.

    :param input: MappingSetDataFrame filename
    :param reconcile_prefix_file: YAML file containing the prefix reconciliation rules.
    :param output: Target file path.
    """
    # The option is declared required and validated by click so that a missing
    # or non-existent rules file is reported as a usage error instead of
    # surfacing later as a TypeError/FileNotFoundError from open().
    msdf = read_sssom_table(input)
    # safe_load only builds plain Python objects, so an untrusted rules file
    # cannot trigger arbitrary object construction.
    with open(reconcile_prefix_file, "rb") as rp_file:
        rp_dict = yaml.safe_load(rp_file)
    recon_msdf = reconcile_prefix_and_data(msdf, rp_dict)
    write_table(recon_msdf, output)
496+
497+
475498
if __name__ == "__main__":
476499
main()

sssom/constants.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,13 @@
33
import os
44
import pathlib
55

6+
from linkml_runtime.utils.schema_as_dict import schema_as_dict
7+
from linkml_runtime.utils.schemaview import SchemaView
8+
69
HERE = pathlib.Path(__file__).parent.resolve()
710
SCHEMA_YAML = os.path.join(HERE, "sssom.yaml")
11+
12+
SCHEMA_VIEW = SchemaView(SCHEMA_YAML)
13+
SCHEMA_DICT = schema_as_dict(SCHEMA_VIEW.schema)
14+
MAPPING_SLOTS = SCHEMA_DICT["classes"]["mapping"]["slots"]
15+
MAPPING_SET_SLOTS = SCHEMA_DICT["classes"]["mapping set"]["slots"]

sssom/util.py

Lines changed: 122 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
import yaml
3232
from linkml_runtime.linkml_model.types import Uriorcurie
3333

34-
from .constants import SCHEMA_YAML
34+
from .constants import SCHEMA_DICT, SCHEMA_YAML
3535
from .context import SSSOM_URI_PREFIX, get_default_metadata, get_jsonld_context
3636
from .internal_context import multivalued_slots
3737
from .sssom_datamodel import Mapping as SSSOM_Mapping
@@ -414,6 +414,12 @@ def dataframe_to_ptable(df: pd.DataFrame, *, inverse_factor: float = 0.5):
414414
predicate_type = PREDICATE_SIBLING
415415
elif predicate == "dbpedia-owl:different":
416416
predicate_type = PREDICATE_SIBLING
417+
# * Added by H2 ############################
418+
elif predicate == "oboInOwl:hasDbXref":
419+
predicate_type = PREDICATE_HAS_DBXREF
420+
elif predicate == "skos:relatedMatch":
421+
predicate_type = PREDICATE_RELATED_MATCH
422+
# * ########################################
417423
else:
418424
raise ValueError(f"Unhandled predicate: {predicate}")
419425

@@ -445,6 +451,22 @@ def dataframe_to_ptable(df: pd.DataFrame, *, inverse_factor: float = 0.5):
445451
inverse_confidence,
446452
confidence,
447453
)
454+
# * Added by H2 ############################
455+
elif predicate_type == PREDICATE_HAS_DBXREF:
456+
ps = (
457+
residual_confidence,
458+
residual_confidence,
459+
confidence,
460+
inverse_confidence,
461+
)
462+
elif predicate_type == PREDICATE_RELATED_MATCH:
463+
ps = (
464+
residual_confidence,
465+
residual_confidence,
466+
confidence,
467+
inverse_confidence,
468+
)
469+
# * #########################################
448470
else:
449471
raise ValueError(f"predicate: {predicate_type}")
450472
row = [subject_id, object_id] + [str(p) for p in ps]
@@ -456,6 +478,10 @@ def dataframe_to_ptable(df: pd.DataFrame, *, inverse_factor: float = 0.5):
456478
PREDICATE_SUPERCLASS = 1
457479
PREDICATE_EQUIVALENT = 2
458480
PREDICATE_SIBLING = 3
481+
# * Added by H2 ############################
482+
PREDICATE_HAS_DBXREF = 4
483+
PREDICATE_RELATED_MATCH = 5
484+
# * ########################################
459485

460486
RDF_FORMATS = {"ttl", "turtle", "nt", "xml"}
461487

@@ -774,13 +800,25 @@ def to_mapping_set_dataframe(doc: MappingSetDocument) -> MappingSetDataFrame:
774800
:return: MappingSetDataFrame object
775801
"""
776802
data = []
803+
slots_with_double_as_range = [
804+
s
805+
for s in SCHEMA_DICT["slots"].keys()
806+
if SCHEMA_DICT["slots"][s]["range"] == "double"
807+
]
777808
if doc.mapping_set.mappings is not None:
778809
for mapping in doc.mapping_set.mappings:
779810
m = get_dict_from_mapping(mapping)
780811
data.append(m)
781812
df = pd.DataFrame(data=data)
782813
meta = extract_global_metadata(doc)
783814
meta.pop(PREFIX_MAP_KEY, None)
815+
# The following 3 lines are to remove columns
816+
# where all values are blank.
817+
df.replace("", np.nan, inplace=True)
818+
df = df.dropna(axis=1, how="all") # remove columns with all row = 'None'-s.
819+
df.loc[:, ~df.columns.isin(slots_with_double_as_range)].replace(
820+
np.nan, "", inplace=True
821+
)
784822
msdf = MappingSetDataFrame(df=df, prefix_map=doc.prefix_map, metadata=meta)
785823
return msdf
786824

@@ -854,7 +892,12 @@ def curie_from_uri(uri: str, prefix_map: Mapping[str, str]) -> str:
854892
uri_prefix = prefix_map[prefix]
855893
if uri.startswith(uri_prefix):
856894
remainder = uri.replace(uri_prefix, "")
857-
return f"{prefix}:{remainder}"
895+
curie = f"{prefix}:{remainder}"
896+
if is_curie(curie):
897+
return f"{prefix}:{remainder}"
898+
else:
899+
logging.warning(f"{prefix}:{remainder} is not a CURIE ... skipping")
900+
continue
858901
raise NoCURIEException(f"{uri} does not follow any known prefixes")
859902

860903

@@ -965,3 +1008,80 @@ def is_multivalued_slot(slot: str) -> bool:
9651008
# return view.get_slot(slot).multivalued
9661009

9671010
return slot in multivalued_slots
1011+
1012+
1013+
def reconcile_prefix_and_data(
    msdf: MappingSetDataFrame, prefix_reconciliation: dict
) -> MappingSetDataFrame:
    """Reconcile the prefix_map and rewrite the affected CURIE prefixes in the dataframe.

    Discussion about this found here:
    https://github.com/mapping-commons/sssom-py/issues/216#issue-1171701052

    :param msdf: Mapping Set DataFrame. Its ``prefix_map`` and ``df`` are updated
        and the same object is returned.
    :param prefix_reconciliation: Prefix reconciliation dictionary from a YAML file.
        Must contain the keys ``prefix_synonyms`` (prefix to be replaced -> correct
        prefix) and ``prefix_expansion_reconciliation`` (prefix -> expansion).
    :raises KeyError: If one of the two required keys is missing, or if a correct
        prefix named in ``prefix_synonyms`` has no entry in
        ``prefix_expansion_reconciliation``.
    :return: Mapping Set DataFrame with reconciled prefix_map and data.
    """
    # Imported locally so this function does not depend on the module's import block.
    import re

    prefix_map = msdf.prefix_map
    df: pd.DataFrame = msdf.df
    data_switch_dict = {}

    prefix_synonyms = prefix_reconciliation["prefix_synonyms"]
    prefix_expansion = prefix_reconciliation["prefix_expansion_reconciliation"]

    # The prefix exists but the expansion needs to be updated.
    expansion_replace = {
        prefix: expansion
        for prefix, expansion in prefix_expansion.items()
        if prefix in prefix_map and expansion != prefix_map[prefix]
    }

    # Updates expansions in prefix_map
    prefix_map.update(expansion_replace)

    # Prefixes that need to be replaced.
    # IF condition:
    #   1. Key OR Value in prefix_synonyms are keys in prefix_map
    #      e.g.: ICD10: ICD10CM - either should be present within
    #      the prefix_map.
    #   AND
    #   2. Value in prefix_synonyms is NOT a key in expansion_replace,
    #      i.e. the correct prefix did not just have its expansion updated above.
    prefix_replace = [
        synonym
        for synonym, correct in prefix_synonyms.items()
        if (synonym in prefix_map or correct in prefix_map)
        and correct not in expansion_replace
    ]

    for pr in prefix_replace:
        correct_prefix = prefix_synonyms[pr]
        correct_expansion = prefix_expansion[correct_prefix]
        prefix_map[correct_prefix] = correct_expansion
        logging.info(f"Adding prefix_map {correct_prefix}: {correct_expansion}")
        if pr in prefix_map:
            # Only prefixes that were actually declared need their CURIEs
            # rewritten in the data.
            prefix_map.pop(pr, None)
            data_switch_dict[pr] = correct_prefix

            logging.warning(f"Replacing prefix {pr} with {correct_prefix}")

    # Data editing
    if data_switch_dict:
        # Only columns whose slot range is EntityReference hold CURIEs.
        slots = SCHEMA_DICT["slots"]
        entity_reference_columns = {
            name for name, slot in slots.items() if slot["range"] == "EntityReference"
        }
        update_columns = [c for c in df.columns if c in entity_reference_columns]
        if update_columns:
            for old_prefix, new_prefix in data_switch_dict.items():
                # Escape the prefix so regex metacharacters (e.g. ".") are taken
                # literally, and require that it is not preceded by another
                # prefix character; otherwise replacing "a" would also rewrite
                # "ba:x". Delimiters such as "|" still allow a match, so
                # delimiter-separated cells are handled as before.
                pattern = rf"(?<![\w.\-]){re.escape(old_prefix)}:"
                df[update_columns] = df[update_columns].replace(
                    pattern, new_prefix + ":", regex=True
                )

    msdf.df = df
    msdf.prefix_map = prefix_map

    # TODO: When expansion of 2 prefixes in the prefix_map are the same.
    return msdf

tests/constants.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,5 @@
77

88
test_out_dir = cwd / "tmp"
99
test_out_dir.mkdir(parents=True, exist_ok=True)
10+
11+
prefix_recon_yaml = data_dir / "prefix_reconciliation.yaml"

tests/data/basic3.tsv

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
# b: "http://example.org/b/"
99
# c: "http://example.org/c/"
1010
# d: "http://example.org/d/"
11+
# rdfs: "http://example.org/rdfs/"
12+
# owl: "http://example.org/owl/"
1113
subject_id subject_label predicate_id predicate_modifier object_id object_label match_type subject_source object_source mapping_tool confidence subject_match_field object_match_field subject_category object_category match_string comment
1214
c:something YYYYY owl:equivalentClass b:something yyyyyy Lexical c d rdf_matcher 0.81 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data
1315
d:something YYYYY owl:equivalentClass Not a:something yyyyyy Lexical d a rdf_matcher 0.82 rdfs:label rdfs:label biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx mock data

tests/data/omim.json

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
{
2+
"graphs" : [ {
3+
"nodes" : [ {
4+
"id" : "http://omim.org/entry/602070",
5+
"meta" : {
6+
"synonyms" : [ {
7+
"pred" : "hasExactSynonym",
8+
"val" : "NRP2",
9+
"xrefs" : [ ]
10+
} ],
11+
"basicPropertyValues" : [ {
12+
"pred" : "http://purl.obolibrary.org/obo/IAO_0000142",
13+
"val" : "http://www.ncbi.nlm.nih.gov/pubmed/11891274"
14+
}, {
15+
"pred" : "http://www.w3.org/2004/02/skos/core#exactMatch",
16+
"val" : "npn2"
17+
}, {
18+
"pred" : "http://purl.obolibrary.org/obo/IAO_0000142",
19+
"val" : "http://www.ncbi.nlm.nih.gov/pubmed/14586460"
20+
}, {
21+
"pred" : "http://purl.obolibrary.org/obo/IAO_0000142",
22+
"val" : "http://www.ncbi.nlm.nih.gov/pubmed/10707970"
23+
}, {
24+
"pred" : "http://purl.obolibrary.org/obo/IAO_0000142",
25+
"val" : "http://www.ncbi.nlm.nih.gov/pubmed/10707971"
26+
}, {
27+
"pred" : "http://purl.obolibrary.org/obo/IAO_0000142",
28+
"val" : "http://www.ncbi.nlm.nih.gov/pubmed/30057110"
29+
}, {
30+
"pred" : "http://purl.obolibrary.org/obo/IAO_0000142",
31+
"val" : "http://www.ncbi.nlm.nih.gov/pubmed/11486090"
32+
}, {
33+
"pred" : "https://w3id.org/biolink/vocab/category",
34+
"val" : "https://w3id.org/biolink/vocab/Gene"
35+
}, {
36+
"pred" : "http://purl.obolibrary.org/obo/IAO_0000142",
37+
"val" : "http://www.ncbi.nlm.nih.gov/pubmed/8895455"
38+
}, {
39+
"pred" : "http://www.w3.org/2004/02/skos/core#exactMatch",
40+
"val" : "neuropilin 2"
41+
}, {
42+
"pred" : "http://purl.obolibrary.org/obo/IAO_0000142",
43+
"val" : "http://www.ncbi.nlm.nih.gov/pubmed/11112349"
44+
}, {
45+
"pred" : "http://purl.obolibrary.org/obo/IAO_0000142",
46+
"val" : "http://www.ncbi.nlm.nih.gov/pubmed/9288754"
47+
}, {
48+
"pred" : "http://purl.obolibrary.org/obo/IAO_0000142",
49+
"val" : "http://www.ncbi.nlm.nih.gov/pubmed/10329017"
50+
}, {
51+
"pred" : "http://purl.obolibrary.org/obo/IAO_0000142",
52+
"val" : "http://www.ncbi.nlm.nih.gov/pubmed/27026195"
53+
}, {
54+
"pred" : "http://www.w3.org/2004/02/skos/core#exactMatch",
55+
"val" : "vascular endothelial growth factor-165 receptor 2"
56+
}, {
57+
"pred" : "http://purl.obolibrary.org/obo/IAO_0000142",
58+
"val" : "http://www.ncbi.nlm.nih.gov/pubmed/20010807"
59+
} ]
60+
},
61+
"type" : "CLASS",
62+
"lbl" : "NRP2"
63+
}, {
64+
"id" : "http://omim.org/entry/602074",
65+
"meta" : {
66+
"synonyms" : [ {
67+
"pred" : "hasExactSynonym",
68+
"val" : "DAP3",
69+
"xrefs" : [ ]
70+
} ],
71+
"basicPropertyValues" : [ {
72+
"pred" : "http://purl.obolibrary.org/obo/IAO_0000142",
73+
"val" : "http://www.ncbi.nlm.nih.gov/pubmed/15179560"
74+
}, {
75+
"pred" : "http://purl.obolibrary.org/obo/IAO_0000142",
76+
"val" : "http://www.ncbi.nlm.nih.gov/pubmed/11279123"
77+
}, {
78+
"pred" : "http://www.w3.org/2004/02/skos/core#exactMatch",
79+
"val" : "death-associated protein 3"
80+
}, {
81+
"pred" : "http://purl.obolibrary.org/obo/IAO_0000142",
82+
"val" : "http://www.ncbi.nlm.nih.gov/pubmed/9284927"
83+
}, {
84+
"pred" : "http://purl.obolibrary.org/obo/IAO_0000142",
85+
"val" : "http://www.ncbi.nlm.nih.gov/pubmed/7499268"
86+
}, {
87+
"pred" : "http://www.w3.org/2004/02/skos/core#exactMatch",
88+
"val" : "mitochondrial ribosomal protein s29"
89+
}, {
90+
"pred" : "https://w3id.org/biolink/vocab/category",
91+
"val" : "https://w3id.org/biolink/vocab/Gene"
92+
} ]
93+
},
94+
"type" : "CLASS",
95+
"lbl" : "DAP3"
96+
},{
97+
"id" : "http://omim.org/entry/136100",
98+
"meta" : {
99+
"basicPropertyValues" : [ {
100+
"pred" : "http://purl.obolibrary.org/obo/IAO_0000142",
101+
"val" : "http://www.ncbi.nlm.nih.gov/pubmed/13869998"
102+
}, {
103+
"pred" : "http://www.w3.org/2004/02/skos/core#exactMatch",
104+
"val" : "2d:4d finger-length ratio"
105+
}, {
106+
"pred" : "http://www.w3.org/2004/02/skos/core#exactMatch",
107+
"val" : "fingers, relative length of"
108+
}, {
109+
"pred" : "http://purl.obolibrary.org/obo/IAO_0000142",
110+
"val" : "http://www.ncbi.nlm.nih.gov/pubmed/14943709"
111+
}, {
112+
"pred" : "http://purl.obolibrary.org/obo/IAO_0000142",
113+
"val" : "http://www.ncbi.nlm.nih.gov/pubmed/20982098"
114+
}, {
115+
"pred" : "http://purl.obolibrary.org/obo/IAO_0000142",
116+
"val" : "http://www.ncbi.nlm.nih.gov/pubmed/20303062"
117+
} ]
118+
},
119+
"type" : "CLASS",
120+
"lbl" : "fingers, relative length of"
121+
}
122+
]
123+
}]
124+
}

0 commit comments

Comments
 (0)