Skip to content

Commit 0c88536

Browse files
gouttegd and cthoyt authored
Add the infer_cardinality method. (#605)
Add a new `infer_cardinality` method to the `MappingSetDataFrame` to fill the `mapping_cardinality` slot with computed cardinality values. The approach used here is more or less a direct Python translation of my existing implementation in SSSOM-Java. The gist of it is that we iterate over the entire set of records a first time to populate two hash tables: one that associates a subject to all the different objects it is mapped to, and one that associates an object to all the different subjects it is mapped to. Then we can iterate over the records a second time, and for every record we can immediately get (1) the number of different objects mapped to the same subject and (2) the number of different subjects mapped to the same object; the combination of those two values gives us the cardinality we are looking for. To deal with the concept of "scope", the "subjects" and "objects" that we use to fill the hash tables are not made of only the "subject_id" slot or the "object_id" slot, but also of all the slots that define the scope. For example, if the scope is `["predicate_id"]`, then for the following record: subject_id predicate_id object_id DO:1234 skos:exactMatch HP:5678 the "subject" string will contain both `DO:1234` and `skos:exactMatch`, and the "object" string will contain both `HP:5678` and `skos:exactMatch`. Co-authored-by: Charles Tapley Hoyt <cthoyt@gmail.com>
1 parent b46329c commit 0c88536

File tree

7 files changed

+195
-0
lines changed

7 files changed

+195
-0
lines changed

src/sssom/constants.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@
8989
MAPPING_SET_SOURCE = "mapping_set_source"
9090
MAPPING_SOURCE = "mapping_source"
9191
MAPPING_CARDINALITY = "mapping_cardinality"
92+
CARDINALITY_SCOPE = "cardinality_scope"
9293
MAPPING_TOOL = "mapping_tool"
9394
MAPPING_TOOL_VERSION = "mapping_tool_version"
9495
MAPPING_DATE = "mapping_date"
@@ -109,6 +110,10 @@
109110
SUBJECT_SOURCE_ID = "subject_source_id"
110111
OBJECT_SOURCE_ID = "object_source_id"
111112

113+
# Special value for "unmapped" entities
114+
# see <https://mapping-commons.github.io/sssom/spec-model/#representing-unmapped-entities>
115+
NO_TERM_FOUND = "sssom:NoTermFound"
116+
112117
# PREDICATES
113118
OWL_EQUIVALENT_CLASS = "owl:equivalentClass"
114119
OWL_EQUIVALENT_PROPERTY = "owl:equivalentProperty"

src/sssom/util.py

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,15 @@
2323
from sssom_schema import MappingSet, slots
2424

2525
from .constants import (
26+
CARDINALITY_SCOPE,
2627
COLUMN_INVERT_DICTIONARY,
2728
COMMENT,
2829
CONFIDENCE,
30+
MAPPING_CARDINALITY,
2931
MAPPING_JUSTIFICATION,
3032
MAPPING_SET_ID,
3133
MAPPING_SET_SOURCE,
34+
NO_TERM_FOUND,
3235
OBJECT_CATEGORY,
3336
OBJECT_ID,
3437
OBJECT_LABEL,
@@ -393,6 +396,106 @@ def condense(self) -> List[str]:
393396
self.df.drop(columns=condensed, inplace=True)
394397
return condensed
395398

399+
def infer_cardinality(self, scope: Optional[List[str]] = None) -> None:
    """Infer cardinality values in the set.

    This method will automatically fill the `mapping_cardinality` slot for
    all records in the set, overwriting any pre-existing values. The
    `cardinality_scope` column is set to the effective scope (slot names
    joined with `|`), or removed if the effective scope is empty.

    See <https://mapping-commons.github.io/sssom/spec-model/#mapping-cardinality-and-cardinality-scope>
    for more information about cardinality computation,
    <https://mapping-commons.github.io/sssom/spec-model/#literal-mappings>
    for how to deal with literal mapping records, and
    <https://mapping-commons.github.io/sssom/spec-model/#representing-unmapped-entities>
    for how to deal with mapping records involving `sssom:NoTermFound`.

    :param scope: A list of slot names that defines the subset of the
                  records in which cardinality will be computed. For
                  example, with a scope of `['predicate_id']`, for any
                  given record the cardinality will be computed relatively
                  to the subset of records that have the same predicate.
                  The default is an empty list, meaning that cardinality is
                  computed relatively to the entire set of records. Names
                  that are not mapping slots are ignored (with a warning),
                  and repeated names are only taken into account once; the
                  order given by the caller is preserved.
    """
    requested = [] if scope is None else list(scope)

    schema = SSSOMSchemaView()
    unknown_slots = [slot for slot in requested if slot not in schema.mapping_slots]
    if unknown_slots:
        logging.warning(f"Ignoring invalid slot name(s): {unknown_slots}.")
    # Filter (and de-duplicate) while keeping the caller's order, so that
    # the value written to `cardinality_scope` is deterministic.
    scope = list(dict.fromkeys(slot for slot in requested if slot not in unknown_slots))

    frame = self.df
    n_rows = len(frame)

    # Helper function returning the values of one column as strings, one
    # per record. An absent column or a missing cell yields an empty
    # string, and non-string values are stringified, so that the keys
    # below can always be built by plain concatenation.
    def _column(name: str) -> List[str]:
        if name not in frame.columns:
            return [""] * n_rows
        series = frame[name]
        return ["" if missing else str(value) for value, missing in zip(series, series.isna())]

    # For every record, the part of the key contributed by the scope slots
    if scope:
        scope_columns = [_column(slot) for slot in scope]
        suffixes = ["".join("\0" + value for value in values) for values in zip(*scope_columns)]
    else:
        suffixes = [""] * n_rows

    # Helper function to build, for every record, the string that
    # represents its subject (or object) in the given scope; `side` is
    # either `subject` or `object`.
    def _keys(side: str) -> List[str]:
        identifiers = _column(f"{side}_id")
        labels = _column(f"{side}_label")
        types = _column(f"{side}_type")
        # We prepend a one-letter code (`L` or `E`) to the actual subject
        # or object so that literal and non-literal mapping records are
        # always distinguishable and can be counted separately.
        return [
            ("L\0" + label if kind == "rdfs literal" else "E\0" + identifier) + suffix
            for identifier, label, kind, suffix in zip(identifiers, labels, types, suffixes)
        ]

    subject_keys = _keys("subject")
    object_keys = _keys("object")
    no_subject = [value == NO_TERM_FOUND for value in _column(SUBJECT_ID)]
    no_object = [value == NO_TERM_FOUND for value in _column(OBJECT_ID)]

    #: Unique subjects for any given object
    subjects_by_object: defaultdict[str, set[str]] = defaultdict(set)
    #: Unique objects for any given subject
    objects_by_subject: defaultdict[str, set[str]] = defaultdict(set)

    # First pass: collect the different objects mapped to each subject and
    # vice versa. Mappings to sssom:NoTermFound are ignored for cardinality
    # computations.
    for subj, obj, s_unmapped, o_unmapped in zip(subject_keys, object_keys, no_subject, no_object):
        if s_unmapped or o_unmapped:
            continue
        subjects_by_object[obj].add(subj)
        objects_by_subject[subj].add(obj)

    # Second pass: compute the actual cardinality value of every record.
    cards = []
    for subj, obj, s_unmapped, o_unmapped in zip(subject_keys, object_keys, no_subject, no_object):
        if s_unmapped or o_unmapped:
            # Special cases involving sssom:NoTermFound on either side:
            # the unmapped side is `0`, the other side is always `1`.
            left = "0" if s_unmapped else "1"
            right = "0" if o_unmapped else "1"
        else:
            # General case
            left = "1" if len(subjects_by_object[obj]) == 1 else "n"
            right = "1" if len(objects_by_subject[subj]) == 1 else "n"
        cards.append(f"{left}:{right}")

    # Add the computed values to the dataframe
    self.df[MAPPING_CARDINALITY] = cards
    if scope:
        self.df[CARDINALITY_SCOPE] = "|".join(scope)
    else:
        # No scope, so remove any pre-existing "cardinality_scope" column
        self.df.drop(columns=CARDINALITY_SCOPE, inplace=True, errors="ignore")
498+
396499

397500
def _standardize_curie_or_iri(curie_or_iri: str, *, converter: Converter) -> str:
398501
"""Standardize a CURIE or IRI, returning the original if not possible.
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
#curie_map:
2+
# COMENT: https://example.com/entities/
3+
# NETENT: https://example.net/entities/
4+
# ORGENT: https://example.org/entities/
5+
# SRC: https://example.org/sources/
6+
#mapping_set_id: https://example.org/sets/cardinality-scope-empty
7+
#license: https://creativecommons.org/licenses/by/4.0/
8+
subject_id subject_label predicate_id object_id object_label mapping_justification object_source mapping_cardinality
9+
ORGENT:0001 alice skos:closeMatch COMENT:0011 alpha semapv:ManualMappingCuration SRC:com 1:n
10+
ORGENT:0001 alice skos:closeMatch NETENT:0111 alpha semapv:ManualMappingCuration SRC:net 1:n
11+
ORGENT:0002 bob skos:closeMatch COMENT:0012 beta semapv:ManualMappingCuration SRC:com 1:n
12+
ORGENT:0002 bob skos:closeMatch NETENT:0112 bravo semapv:ManualMappingCuration SRC:net 1:n
13+
ORGENT:0007 gavin skos:closeMatch NETENT:0117 golf semapv:ManualMappingCuration SRC:net 1:n
14+
ORGENT:0007 gavin skos:exactMatch COMENT:0013 gamma semapv:ManualMappingCuration SRC:com 1:n
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
#curie_map:
2+
# OBJ: https://example.org/object/
3+
# SRC: https://example.org/sources/
4+
# SUBJ: https://example.org/subject/
5+
#mapping_set_id: https://example.org/sets/cardinality-with-unmapped-entities
6+
#license: https://creativecommons.org/licenses/by/4.0/
7+
subject_id predicate_id object_id mapping_justification subject_source object_source mapping_cardinality comment
8+
SUBJ:0001 skos:exactMatch sssom:NoTermFound semapv:ManualMappingCuration SRC:A SRC:B 1:0 S1 in vocabulary A has no exact match in vocabulary B
9+
SUBJ:0001 skos:closeMatch OBJ:0001 semapv:ManualMappingCuration SRC:A SRC:B 1:1 S1 mapped only to O1, O1 mapped only to S1 -- the record involving sssom:NoTermFound does not count, as it is an absence of match rather than an actual mapping
10+
sssom:NoTermFound skos:exactMatch OBJ:0002 semapv:ManualMappingCuration SRC:C SRC:D 0:1 O2 in vocabulary D has no exact match in vocabulary C
11+
sssom:NoTermFound skos:exactMatch sssom:NoTermFound semapv:ManualMappingCuration SRC:E SRC:F 0:0 No exact match between any term from vocabulary E and any term for vocabulary F (in other words, the two vocabularies are completely disjoint, at least as far as exact matches are considered)
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
#curie_map:
2+
# OBJ: https://example.org/object/
3+
# SUBJ: https://example.org/subject/
4+
#mapping_set_id: https://example.org/sets/cardinality-with-literal-mappings
5+
#license: https://creativecommons.org/licenses/by/4.0/
6+
subject_id predicate_id object_id object_label mapping_justification object_type mapping_cardinality comment
7+
SUBJ:0001 skos:exactMatch OBJ:0001 semapv:LexicalMatching 1:n S1 mapped to O1 (entity) and O1 (literal)
8+
SUBJ:0001 skos:exactMatch OBJ:0001 OBJ:0001 semapv:MappingReview rdfs literal 1:n S1 mapped to O1 (entity) and O1 (literal)

tests/data/cardinality.sssom.tsv

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#curie_map:
2+
# OBJ: https://example.org/object/
3+
# SUBJ: https://example.org/subject/
4+
#mapping_set_id: https://example.org/sets/cardinality
5+
#license: https://creativecommons.org/licenses/by/4.0/
6+
subject_id predicate_id object_id mapping_justification mapping_cardinality comment
7+
SUBJ:0001 skos:exactMatch OBJ:0001 semapv:LexicalMatching 1:1 S1 and O1 only mapped to each other
8+
SUBJ:0001 skos:exactMatch OBJ:0001 semapv:MappingReview 1:1 S1 and O1 only mapped to each other
9+
SUBJ:0002 skos:exactMatch OBJ:0002 semapv:LexicalMatching 1:n S2 mapped to both O2 and O3, O2 mapped only to S2
10+
SUBJ:0002 skos:exactMatch OBJ:0003 semapv:LexicalMatching 1:n S2 mapped to both O2 and O3, O3 mapped only to S2
11+
SUBJ:0003 skos:exactMatch OBJ:0004 semapv:LexicalMatching n:1 S3 and S4 both mapped to only O4
12+
SUBJ:0004 skos:exactMatch OBJ:0004 semapv:LexicalMatching n:1 S3 and S4 both mapped to only O4
13+
SUBJ:0005 skos:exactMatch OBJ:0005 semapv:LexicalMatching n:n S5 mapped to O5 and O6, O5 mapped to S5 and S6
14+
SUBJ:0005 skos:exactMatch OBJ:0006 semapv:LexicalMatching 1:n S5 mapped to O5 and O6, O6 mapped only to S5
15+
SUBJ:0006 skos:exactMatch OBJ:0005 semapv:LexicalMatching n:1 S6 mapped only to O5, O5 mapped to both S5 and S6

tests/test_utils.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,9 @@
1212
from sssom_schema import slots as SSSOM_Slots
1313

1414
from sssom.constants import (
15+
CARDINALITY_SCOPE,
1516
CREATOR_ID,
17+
MAPPING_CARDINALITY,
1618
OBJECT_ID,
1719
OBJECT_LABEL,
1820
PREDICATE_ID,
@@ -595,3 +597,40 @@ def test_propagation_fill_empty_mode(self) -> None:
595597
self.assertIn("mapping_tool", propagated_slots)
596598
self.assertNotIn("mapping_tool", msdf.metadata)
597599
self.assertEqual(2, len(msdf.df["mapping_tool"].unique()))
600+
601+
def test_infer_cardinality(self) -> None:
    """Check inferred cardinalities against the values stored in the test files."""
    fixtures = (
        "cardinality.sssom.tsv",
        "cardinality-with-NoTermFound.sssom.tsv",
        "cardinality-with-literal-mappings.sssom.tsv",
    )
    for fixture in fixtures:
        msdf = parse_sssom_table(f"{data_dir}/{fixture}")
        # Each file already carries the expected values in its
        # `mapping_cardinality` column: take them out of the dataframe,
        # then let the method recompute the column from scratch.
        expected = list(msdf.df.pop(MAPPING_CARDINALITY).values)
        msdf.infer_cardinality()
        inferred = list(msdf.df[MAPPING_CARDINALITY].values)
        self.assertEqual(expected, inferred)
615+
616+
def test_infer_scoped_cardinality(self) -> None:
    """Check cardinality inference under several cardinality scopes."""
    msdf = parse_sssom_table(f"{data_dir}/cardinality-scope.sssom.tsv")

    # Each case is (scope passed to the method, expected cardinalities).
    # The cases are run in order against the same mapping set.
    cases = [
        (["predicate_id"], ["1:n", "1:n", "1:n", "1:n", "1:1", "1:1"]),
        (["object_source"], ["1:1", "1:1", "1:1", "1:1", "1:1", "1:1"]),
        # An invalid slot name next to a valid one should be ignored, so
        # the result is the same as with the valid slot alone.
        (
            ["object_source", "not_a_valid_slot_name"],
            ["1:1", "1:1", "1:1", "1:1", "1:1", "1:1"],
        ),
        # Only invalid slot names: should be equivalent to an empty scope.
        (["not_a_valid_slot_name"], ["1:n", "1:n", "1:n", "1:n", "1:n", "1:n"]),
    ]
    for scope, expected in cases:
        msdf.infer_cardinality(scope)
        self.assertEqual(expected, list(msdf.df[MAPPING_CARDINALITY].values))

    # After the last (effectively unscoped) run, no scope column must remain.
    self.assertNotIn(CARDINALITY_SCOPE, msdf.df.columns)

0 commit comments

Comments
 (0)