Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/sssom/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@
MAPPING_SET_SOURCE = "mapping_set_source"
MAPPING_SOURCE = "mapping_source"
# Slot holding the cardinality of a mapping record; `infer_cardinality` (in
# util.py) fills it with one of "1:1", "1:n", "n:1", "n:n", "1:0", "0:1", "0:0"
MAPPING_CARDINALITY = "mapping_cardinality"
# Slot holding the scope relative to which the cardinality was computed;
# `infer_cardinality` writes it as the scope's slot names joined by "|"
CARDINALITY_SCOPE = "cardinality_scope"
MAPPING_TOOL = "mapping_tool"
MAPPING_TOOL_VERSION = "mapping_tool_version"
MAPPING_DATE = "mapping_date"
Expand All @@ -109,6 +110,10 @@
SUBJECT_SOURCE_ID = "subject_source_id"
OBJECT_SOURCE_ID = "object_source_id"

# Special value for "unmapped" entities: it is used in place of a subject or
# object identifier to state that no matching term was found. Records using it
# are not counted as actual mappings when cardinalities are inferred.
# see <https://mapping-commons.github.io/sssom/spec-model/#representing-unmapped-entities>
NO_TERM_FOUND = "sssom:NoTermFound"

# PREDICATES
OWL_EQUIVALENT_CLASS = "owl:equivalentClass"
OWL_EQUIVALENT_PROPERTY = "owl:equivalentProperty"
Expand Down
94 changes: 94 additions & 0 deletions src/sssom/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,15 @@
from sssom_schema import MappingSet, slots

from .constants import (
CARDINALITY_SCOPE,
COLUMN_INVERT_DICTIONARY,
COMMENT,
CONFIDENCE,
MAPPING_CARDINALITY,
MAPPING_JUSTIFICATION,
MAPPING_SET_ID,
MAPPING_SET_SOURCE,
NO_TERM_FOUND,
OBJECT_CATEGORY,
OBJECT_ID,
OBJECT_LABEL,
Expand Down Expand Up @@ -393,6 +396,97 @@ def condense(self) -> List[str]:
self.df.drop(columns=condensed, inplace=True)
return condensed

def infer_cardinality(self, scope: List[str] = None) -> None:
    """Infer cardinality values in the set.

    This method will automatically fill the `mapping_cardinality` slot for
    all records in the set, overwriting any pre-existing values. When a
    non-empty scope is given, the `cardinality_scope` slot is filled as well
    (with the slot names of the scope joined by `|`); otherwise any
    pre-existing `cardinality_scope` column is removed.

    See <https://mapping-commons.github.io/sssom/spec-model/#mapping-cardinality-and-cardinality-scope>
    for more information about cardinality computation,
    <https://mapping-commons.github.io/sssom/spec-model/#literal-mappings>
    for how to deal with literal mapping records, and
    <https://mapping-commons.github.io/sssom/spec-model/#representing-unmapped-entities>
    for how to deal with mapping records involving `sssom:NoTermFound`.

    Cell values are compared through their string representation; missing
    values and absent columns are treated as empty strings, so that sparse
    or non-string columns can safely be part of the scope.

    :param scope: A list of slot names that defines the subset of the
                  records in which cardinality will be computed. For
                  example, with a scope of `['predicate_id']`, for any
                  given record the cardinality will be computed relatively
                  to the subset of records that have the same predicate.
                  The default is an empty list, meaning that cardinality is
                  computed relatively to the entire set of records. A
                  single slot name given as a plain string is treated as
                  a one-element list.
    """
    if scope is None:
        scope = []
    elif isinstance(scope, str):
        # A bare string would otherwise be iterated character by character
        scope = [scope]
    else:
        scope = list(scope)

    df = self.df
    n_rows = len(df)

    def _column(name):
        """Return column `name` as a list of strings ("" where missing)."""
        if name not in df.columns:
            return [""] * n_rows
        series = df[name]
        return ["" if missing else str(value) for value, missing in zip(series, series.isna())]

    # One tuple of scope values per record: two records can only be counted
    # together if they agree on every slot of the scope.
    scope_columns = [_column(slot) for slot in scope]
    scope_keys = list(zip(*scope_columns)) if scope_columns else [()] * n_rows

    def _side_keys(side, ids):
        """Build, for each record, the key identifying its subject or object.

        `side` is either `subject` or `object`. The key starts with a
        one-letter code (`L` or `E`) so that literal and non-literal
        mapping records are always distinguishable and counted separately.
        """
        types = _column(f"{side}_type")
        labels = _column(f"{side}_label")
        return [
            ("L", label, *extra) if type_ == "rdfs literal" else ("E", id_, *extra)
            for type_, label, id_, extra in zip(types, labels, ids, scope_keys)
        ]

    subject_ids = _column(SUBJECT_ID)
    object_ids = _column(OBJECT_ID)

    # (subject key, object key, subject is unmapped, object is unmapped)
    records = list(
        zip(
            _side_keys("subject", subject_ids),
            _side_keys("object", object_ids),
            [value == NO_TERM_FOUND for value in subject_ids],
            [value == NO_TERM_FOUND for value in object_ids],
        )
    )

    # First pass: collect the different objects mapped to each subject and
    # vice versa. Mappings to sssom:NoTermFound are ignored here, since they
    # represent an absence of match rather than an actual mapping.
    subjects_by_object: dict[tuple, set] = {}  # Unique subjects for any given object
    objects_by_subject: dict[tuple, set] = {}  # Unique objects for any given subject
    for subj, obj, no_subject, no_object in records:
        if no_subject or no_object:
            continue
        subjects_by_object.setdefault(obj, set()).add(subj)
        objects_by_subject.setdefault(subj, set()).add(obj)

    def _symbol(count):
        return "1" if count == 1 else "n"

    # Second pass: compute the actual cardinality value of each record
    cards = []
    for subj, obj, no_subject, no_object in records:
        if no_subject:
            # Special cases involving sssom:NoTermFound on either side
            cards.append("0:0" if no_object else "0:1")
        elif no_object:
            cards.append("1:0")
        else:
            # General case: <subjects mapped to this object>:<objects mapped to this subject>
            n_subjects = len(subjects_by_object[obj])
            n_objects = len(objects_by_subject[subj])
            cards.append(f"{_symbol(n_subjects)}:{_symbol(n_objects)}")

    # Add the computed values to the dataframe
    self.df[MAPPING_CARDINALITY] = cards
    if scope:
        self.df[CARDINALITY_SCOPE] = "|".join(scope)
    else:
        # No scope, so remove any pre-existing "cardinality_scope" column
        self.df.drop(columns=CARDINALITY_SCOPE, inplace=True, errors="ignore")


def _standardize_curie_or_iri(curie_or_iri: str, *, converter: Converter) -> str:
"""Standardize a CURIE or IRI, returning the original if not possible.
Expand Down
14 changes: 14 additions & 0 deletions tests/data/cardinality-scope.sssom.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#curie_map:
# COMENT: https://example.com/entities/
# NETENT: https://example.net/entities/
# ORGENT: https://example.org/entities/
# SRC: https://example.org/sources/
#mapping_set_id: https://example.org/sets/cardinality-scope-empty
#license: https://creativecommons.org/licenses/by/4.0/
subject_id subject_label predicate_id object_id object_label mapping_justification object_source mapping_cardinality
ORGENT:0001 alice skos:closeMatch COMENT:0011 alpha semapv:ManualMappingCuration SRC:com 1:n
ORGENT:0001 alice skos:closeMatch NETENT:0111 alpha semapv:ManualMappingCuration SRC:net 1:n
ORGENT:0002 bob skos:closeMatch COMENT:0012 beta semapv:ManualMappingCuration SRC:com 1:n
ORGENT:0002 bob skos:closeMatch NETENT:0112 bravo semapv:ManualMappingCuration SRC:net 1:n
ORGENT:0007 gavin skos:closeMatch NETENT:0117 golf semapv:ManualMappingCuration SRC:net 1:n
ORGENT:0007 gavin skos:exactMatch COMENT:0013 gamma semapv:ManualMappingCuration SRC:com 1:n
11 changes: 11 additions & 0 deletions tests/data/cardinality-with-NoTermFound.sssom.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#curie_map:
# OBJ: https://example.org/object/
# SRC: https://example.org/sources/
# SUBJ: https://example.org/subject/
#mapping_set_id: https://example.org/sets/cardinality-with-unmapped-entities
#license: https://creativecommons.org/licenses/by/4.0/
subject_id predicate_id object_id mapping_justification subject_source object_source mapping_cardinality comment
SUBJ:0001 skos:exactMatch sssom:NoTermFound semapv:ManualMappingCuration SRC:A SRC:B 1:0 S1 in vocabulary A has no exact match in vocabulary B
SUBJ:0001 skos:closeMatch OBJ:0001 semapv:ManualMappingCuration SRC:A SRC:B 1:1 S1 mapped only to O1, O1 mapped only to S1 -- the record involving sssom:NoTermFound does not count, as it is an absence of match rather than an actual mapping
sssom:NoTermFound skos:exactMatch OBJ:0002 semapv:ManualMappingCuration SRC:C SRC:D 0:1 O2 in vocabulary D has no exact match in vocabulary C
sssom:NoTermFound skos:exactMatch sssom:NoTermFound semapv:ManualMappingCuration SRC:E SRC:F 0:0 No exact match between any term from vocabulary E and any term for vocabulary F (in other words, the two vocabularies are completely disjoint, at least as far as exact matches are considered)
8 changes: 8 additions & 0 deletions tests/data/cardinality-with-literal-mappings.sssom.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#curie_map:
# OBJ: https://example.org/object/
# SUBJ: https://example.org/subject/
#mapping_set_id: https://example.org/sets/cardinality-with-literal-mappings
#license: https://creativecommons.org/licenses/by/4.0/
subject_id predicate_id object_id object_label mapping_justification object_type mapping_cardinality comment
SUBJ:0001 skos:exactMatch OBJ:0001 semapv:LexicalMatching 1:n S1 mapped to O1 (entity) and O1 (literal)
SUBJ:0001 skos:exactMatch OBJ:0001 OBJ:0001 semapv:MappingReview rdfs literal 1:n S1 mapped to O1 (entity) and O1 (literal)
15 changes: 15 additions & 0 deletions tests/data/cardinality.sssom.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#curie_map:
# OBJ: https://example.org/object/
# SUBJ: https://example.org/subject/
#mapping_set_id: https://example.org/sets/cardinality
#license: https://creativecommons.org/licenses/by/4.0/
subject_id predicate_id object_id mapping_justification mapping_cardinality comment
SUBJ:0001 skos:exactMatch OBJ:0001 semapv:LexicalMatching 1:1 S1 and O1 only mapped to each other
SUBJ:0001 skos:exactMatch OBJ:0001 semapv:MappingReview 1:1 S1 and O1 only mapped to each other
SUBJ:0002 skos:exactMatch OBJ:0002 semapv:LexicalMatching 1:n S2 mapped to both O2 and O3, O2 mapped only to S2
SUBJ:0002 skos:exactMatch OBJ:0003 semapv:LexicalMatching 1:n S2 mapped to both O2 and O3, O3 mapped only to S2
SUBJ:0003 skos:exactMatch OBJ:0004 semapv:LexicalMatching n:1 S3 and S4 both mapped to only O4
SUBJ:0004 skos:exactMatch OBJ:0004 semapv:LexicalMatching n:1 S3 and S4 both mapped to only O4
SUBJ:0005 skos:exactMatch OBJ:0005 semapv:LexicalMatching n:n S5 mapped to O5 and O6, O5 mapped to S5 and S6
SUBJ:0005 skos:exactMatch OBJ:0006 semapv:LexicalMatching 1:n S5 mapped to O5 and O6, O6 mapped only to S5
SUBJ:0006 skos:exactMatch OBJ:0005 semapv:LexicalMatching n:1 S6 mapped only to O5, O5 mapped to both S5 and S6
28 changes: 28 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

from sssom.constants import (
CREATOR_ID,
MAPPING_CARDINALITY,
OBJECT_ID,
OBJECT_LABEL,
PREDICATE_ID,
Expand Down Expand Up @@ -595,3 +596,30 @@ def test_propagation_fill_empty_mode(self) -> None:
self.assertIn("mapping_tool", propagated_slots)
self.assertNotIn("mapping_tool", msdf.metadata)
self.assertEqual(2, len(msdf.df["mapping_tool"].unique()))

def test_infer_cardinality(self) -> None:
    """Test cardinality computation against precomputed values.

    Each test file already contains the expected `mapping_cardinality`
    values: they are saved, removed from the dataframe, recomputed, and
    compared with the saved values.
    """
    filenames = [
        "cardinality.sssom.tsv",
        "cardinality-with-NoTermFound.sssom.tsv",
        "cardinality-with-literal-mappings.sssom.tsv",
    ]
    for filename in filenames:
        # One sub-test per file, so that a failure names the offending file
        # and the remaining files are still checked.
        with self.subTest(filename=filename):
            msdf = parse_sssom_table(f"{data_dir}/{filename}")
            # Expected values are already contained in the test file
            expected = list(msdf.df[MAPPING_CARDINALITY].values)
            msdf.df.drop(columns=MAPPING_CARDINALITY, inplace=True)
            msdf.infer_cardinality()
            self.assertEqual(expected, list(msdf.df[MAPPING_CARDINALITY].values))

def test_infer_scoped_cardinality(self) -> None:
    """Test cardinality computation with scopes.

    Besides the cardinality values themselves, this checks that the
    `cardinality_scope` column reflects the scope that was used, and that
    it is removed again when cardinality is computed without a scope.
    """
    msdf = parse_sssom_table(f"{data_dir}/cardinality-scope.sssom.tsv")

    # Within a given predicate, the last subject is mapped to a single object
    msdf.infer_cardinality(["predicate_id"])
    expected = ["1:n", "1:n", "1:n", "1:n", "1:1", "1:1"]
    self.assertEqual(expected, list(msdf.df[MAPPING_CARDINALITY].values))
    self.assertEqual({"predicate_id"}, set(msdf.df["cardinality_scope"]))

    # Within a given object source, every subject is mapped to a single object
    msdf.infer_cardinality(["object_source"])
    expected = ["1:1", "1:1", "1:1", "1:1", "1:1", "1:1"]
    self.assertEqual(expected, list(msdf.df[MAPPING_CARDINALITY].values))
    self.assertEqual({"object_source"}, set(msdf.df["cardinality_scope"]))

    # Without a scope, every subject is mapped to two objects and the
    # scope column left over from the previous calls must disappear
    msdf.infer_cardinality()
    expected = ["1:n", "1:n", "1:n", "1:n", "1:n", "1:n"]
    self.assertEqual(expected, list(msdf.df[MAPPING_CARDINALITY].values))
    self.assertNotIn("cardinality_scope", msdf.df.columns)
Loading