Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 94 additions & 0 deletions src/sssom/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,100 @@ def condense(self) -> List[str]:
self.df.drop(columns=condensed, inplace=True)
return condensed

def infer_cardinality(self, scope: List[str] = None) -> None:
    """Infer cardinality values in the set.

    This method fills the `mapping_cardinality` slot for all records in
    the set, overwriting any pre-existing values. It also sets the
    `cardinality_scope` slot to the `|`-joined scope when a scope is
    given, and removes that column otherwise.

    See <https://mapping-commons.github.io/sssom/spec-model/#mapping-cardinality-and-cardinality-scope>
    for more information about cardinality computation,
    <https://mapping-commons.github.io/sssom/spec-model/#literal-mappings>
    for how to deal with literal mapping records, and
    <https://mapping-commons.github.io/sssom/spec-model/#representing-unmapped-entities>
    for how to deal with mapping records involving `sssom:NoTermFound`.

    Missing cells (None/NaN/NA) are treated as empty strings and
    non-string cells are stringified, so that scoping on a numeric or
    sparsely populated slot does not fail.

    :param scope: A list of slot names that defines the subset of the
                  records in which cardinality will be computed. For
                  example, with a scope of `['predicate_id']`, for any
                  given record the cardinality will be computed relatively
                  to the subset of records that have the same predicate.
                  The default (`None`) is equivalent to an empty list,
                  meaning that cardinality is computed relatively to the
                  entire set of records.
    """
    # Imported locally so the method only relies on pandas being
    # installed, not on how the enclosing module names it.
    import pandas as pd

    # NOTE(review): a reviewer asked for the field names and the
    # `sssom:NoTermFound` value to come from the project's constants
    # module (`sssom.constants`), and the author agreed for consistency.
    # That module is not visible from this block, so the literals are kept
    # here under local names -- TODO confirm the constant names and switch.
    no_term_found = "sssom:NoTermFound"
    literal_type = "rdfs literal"

    slots = list(scope) if scope else []

    def _text(value):
        # Normalise a cell to a string; missing values become "".
        if isinstance(value, str):
            return value
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ""
        return str(value)

    def _key(row, side):
        # Identify the subject (or object) of a record within the scope.
        # A leading `L` or `E` tag keeps literal and non-literal mapping
        # records distinguishable so they are counted separately; literals
        # are identified by their label, entities by their ID.
        if _text(row.get(f"{side}_type")) == literal_type:
            parts = ["L", _text(row.get(f"{side}_label"))]
        else:
            parts = ["E", _text(row.get(f"{side}_id"))]
        parts.extend(_text(row.get(slot)) for slot in slots)
        return tuple(parts)

    subjects_by_object = {}  # Unique subjects for any given object
    objects_by_subject = {}  # Unique objects for any given subject

    # Single pass over the records. For each one we remember, in dataframe
    # order, either its final cardinality (records involving
    # sssom:NoTermFound) or its (subject, object) keys, to be resolved
    # once all records have been seen.
    entries = []
    for _, row in self.df.iterrows():
        no_subject = _text(row.get("subject_id")) == no_term_found
        no_object = _text(row.get("object_id")) == no_term_found
        if no_subject or no_object:
            # Mappings to sssom:NoTermFound are ignored when counting and
            # get a fixed cardinality: 0 on each side that is unmapped.
            entries.append(
                ("0" if no_subject else "1") + ":" + ("0" if no_object else "1")
            )
            continue

        subj = _key(row, "subject")
        obj = _key(row, "object")
        subjects_by_object.setdefault(obj, set()).add(subj)
        objects_by_subject.setdefault(subj, set()).add(obj)
        entries.append((subj, obj))

    # Resolve the general case now that the counts are complete. The
    # values are collected in a list because rows must not be modified
    # while iterating over the dataframe.
    cards = []
    for entry in entries:
        if isinstance(entry, str):
            cards.append(entry)
            continue
        subj, obj = entry
        subject_side = "1" if len(subjects_by_object[obj]) == 1 else "n"
        object_side = "1" if len(objects_by_subject[subj]) == 1 else "n"
        cards.append(f"{subject_side}:{object_side}")

    # Add the computed values to the dataframe
    self.df["mapping_cardinality"] = cards
    if slots:
        self.df["cardinality_scope"] = "|".join(slots)
    else:
        # No scope, so remove any pre-existing "cardinality_scope" column
        self.df.drop(columns="cardinality_scope", inplace=True, errors="ignore")


def _standardize_curie_or_iri(curie_or_iri: str, *, converter: Converter) -> str:
"""Standardize a CURIE or IRI, returning the original if not possible.
Expand Down
14 changes: 14 additions & 0 deletions tests/data/cardinality-scope.sssom.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#curie_map:
# COMENT: https://example.com/entities/
# NETENT: https://example.net/entities/
# ORGENT: https://example.org/entities/
# SRC: https://example.org/sources/
#mapping_set_id: https://example.org/sets/cardinality-scope-empty
#license: https://creativecommons.org/licenses/by/4.0/
subject_id subject_label predicate_id object_id object_label mapping_justification object_source mapping_cardinality
ORGENT:0001 alice skos:closeMatch COMENT:0011 alpha semapv:ManualMappingCuration SRC:com 1:n
ORGENT:0001 alice skos:closeMatch NETENT:0111 alpha semapv:ManualMappingCuration SRC:net 1:n
ORGENT:0002 bob skos:closeMatch COMENT:0012 beta semapv:ManualMappingCuration SRC:com 1:n
ORGENT:0002 bob skos:closeMatch NETENT:0112 bravo semapv:ManualMappingCuration SRC:net 1:n
ORGENT:0007 gavin skos:closeMatch NETENT:0117 golf semapv:ManualMappingCuration SRC:net 1:n
ORGENT:0007 gavin skos:exactMatch COMENT:0013 gamma semapv:ManualMappingCuration SRC:com 1:n
11 changes: 11 additions & 0 deletions tests/data/cardinality-with-NoTermFound.sssom.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#curie_map:
# OBJ: https://example.org/object/
# SRC: https://example.org/sources/
# SUBJ: https://example.org/subject/
#mapping_set_id: https://example.org/sets/cardinality-with-unmapped-entities
#license: https://creativecommons.org/licenses/by/4.0/
subject_id predicate_id object_id mapping_justification subject_source object_source mapping_cardinality comment
SUBJ:0001 skos:exactMatch sssom:NoTermFound semapv:ManualMappingCuration SRC:A SRC:B 1:0 S1 in vocabulary A has no exact match in vocabulary B
SUBJ:0001 skos:closeMatch OBJ:0001 semapv:ManualMappingCuration SRC:A SRC:B 1:1 S1 mapped only to O1, O1 mapped only to S1 -- the record involving sssom:NoTermFound does not count, as it is an absence of match rather than an actual mapping
sssom:NoTermFound skos:exactMatch OBJ:0002 semapv:ManualMappingCuration SRC:C SRC:D 0:1 O2 in vocabulary D has no exact match in vocabulary C
sssom:NoTermFound skos:exactMatch sssom:NoTermFound semapv:ManualMappingCuration SRC:E SRC:F 0:0 No exact match between any term from vocabulary E and any term for vocabulary F (in other words, the two vocabularies are completely disjoint, at least as far as exact matches are considered)
8 changes: 8 additions & 0 deletions tests/data/cardinality-with-literal-mappings.sssom.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#curie_map:
# OBJ: https://example.org/object/
# SUBJ: https://example.org/subject/
#mapping_set_id: https://example.org/sets/cardinality-with-literal-mappings
#license: https://creativecommons.org/licenses/by/4.0/
subject_id predicate_id object_id object_label mapping_justification object_type mapping_cardinality comment
SUBJ:0001 skos:exactMatch OBJ:0001 semapv:LexicalMatching 1:n S1 mapped to O1 (entity) and O1 (literal)
SUBJ:0001 skos:exactMatch OBJ:0001 OBJ:0001 semapv:MappingReview rdfs literal 1:n S1 mapped to O1 (entity) and O1 (literal)
15 changes: 15 additions & 0 deletions tests/data/cardinality.sssom.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#curie_map:
# OBJ: https://example.org/object/
# SUBJ: https://example.org/subject/
#mapping_set_id: https://example.org/sets/cardinality
#license: https://creativecommons.org/licenses/by/4.0/
subject_id predicate_id object_id mapping_justification mapping_cardinality comment
SUBJ:0001 skos:exactMatch OBJ:0001 semapv:LexicalMatching 1:1 S1 and O1 only mapped to each other
SUBJ:0001 skos:exactMatch OBJ:0001 semapv:MappingReview 1:1 S1 and O1 only mapped to each other
SUBJ:0002 skos:exactMatch OBJ:0002 semapv:LexicalMatching 1:n S2 mapped to both O2 and O3, O2 mapped only to S2
SUBJ:0002 skos:exactMatch OBJ:0003 semapv:LexicalMatching 1:n S2 mapped to both O2 and O3, O3 mapped only to S2
SUBJ:0003 skos:exactMatch OBJ:0004 semapv:LexicalMatching n:1 S3 and S4 both mapped to only O4
SUBJ:0004 skos:exactMatch OBJ:0004 semapv:LexicalMatching n:1 S3 and S4 both mapped to only O4
SUBJ:0005 skos:exactMatch OBJ:0005 semapv:LexicalMatching n:n S5 mapped to O5 and O6, O5 mapped to S5 and S6
SUBJ:0005 skos:exactMatch OBJ:0006 semapv:LexicalMatching 1:n S5 mapped to O5 and O6, O6 mapped only to S5
SUBJ:0006 skos:exactMatch OBJ:0005 semapv:LexicalMatching n:1 S6 mapped only to O5, O5 mapped to both S5 and S6
27 changes: 27 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -595,3 +595,30 @@ def test_propagation_fill_empty_mode(self) -> None:
self.assertIn("mapping_tool", propagated_slots)
self.assertNotIn("mapping_tool", msdf.metadata)
self.assertEqual(2, len(msdf.df["mapping_tool"].unique()))

def test_infer_cardinality(self) -> None:
    """Check inferred cardinalities against the values stored in each fixture."""
    fixtures = (
        "cardinality.sssom.tsv",
        "cardinality-with-NoTermFound.sssom.tsv",
        "cardinality-with-literal-mappings.sssom.tsv",
    )
    for filename in fixtures:
        mapping_set = parse_sssom_table(f"{data_dir}/{filename}")
        # The fixture already carries the expected values in its
        # `mapping_cardinality` column: keep a copy, then drop the column
        # so that the values compared below come from the inference.
        reference = list(mapping_set.df["mapping_cardinality"].values)
        mapping_set.df.drop(columns="mapping_cardinality", inplace=True)
        mapping_set.infer_cardinality()
        inferred = list(mapping_set.df["mapping_cardinality"].values)
        self.assertEqual(reference, inferred)

def test_infer_scoped_cardinality(self) -> None:
    """Check cardinality inference when it is restricted to a scope."""
    mapping_set = parse_sssom_table(f"{data_dir}/cardinality-scope.sssom.tsv")

    # Each case is (scope, expected cardinality of every record, in order).
    # The same mapping set is re-inferred for each scope, so every run
    # overwrites the values computed by the previous one.
    cases = (
        (["predicate_id"], ["1:n", "1:n", "1:n", "1:n", "1:1", "1:1"]),
        (["object_source"], ["1:1", "1:1", "1:1", "1:1", "1:1", "1:1"]),
    )
    for scope, expected in cases:
        mapping_set.infer_cardinality(scope)
        inferred = list(mapping_set.df["mapping_cardinality"].values)
        self.assertEqual(expected, inferred)
Loading