Add the infer_cardinality method. (#605)

gouttegd · cthoyt · web-flow · commit 0c885365f6ee · 2025-09-09T13:46:48.000+01:00
Add a new `infer_cardinality` method to the `MappingSetDataFrame` to
fill the `mapping_cardinality` slot with computed cardinality values.

The approach used here is more or less a direct Python translation of my
existing implementation in SSSOM-Java.

The gist of it is that we iterate over the entire set of records a first
time to populate two hash tables: one that associates a subject to all
the different objects it is mapped to, and one that associates an object
to all the different subjects it is mapped to. Then we can iterate over
the records a second time, and for every record we can immediately get
(1) the number of different objects mapped to the same subject and (2)
the number of different subjects mapped to the same object; the
combination of those two values gives us the cardinality we are looking
for.

To deal with the concept of "scope", the "subjects" and "objects" that
we use to fill the hash tables are not made of only the "subject_id"
slot or the "object_id" slot, but also of all the slots that define the
scope. For example, if the scope is `["predicate_id"]`, then for the
following record:

  subject_id   predicate_id      object_id
  DO:1234      skos:exactMatch   HP:5678

the "subject" string will contain both `DO:1234` and `skos:exactMatch`,
and the "object" string will contain both `HP:5678` and
`skos:exactMatch`.

Co-authored-by: Charles Tapley Hoyt <cthoyt@gmail.com>
diff --git a/src/sssom/constants.py b/src/sssom/constants.py
@@ -89,6 +89,7 @@
 MAPPING_SET_SOURCE = "mapping_set_source"
 MAPPING_SOURCE = "mapping_source"
 MAPPING_CARDINALITY = "mapping_cardinality"
+CARDINALITY_SCOPE = "cardinality_scope"
 MAPPING_TOOL = "mapping_tool"
 MAPPING_TOOL_VERSION = "mapping_tool_version"
 MAPPING_DATE = "mapping_date"
@@ -109,6 +110,10 @@
 SUBJECT_SOURCE_ID = "subject_source_id"
 OBJECT_SOURCE_ID = "object_source_id"
 
+# Special value for "unmapped" entities
+# see <https://mapping-commons.github.io/sssom/spec-model/#representing-unmapped-entities>
+NO_TERM_FOUND = "sssom:NoTermFound"
+
 # PREDICATES
 OWL_EQUIVALENT_CLASS = "owl:equivalentClass"
 OWL_EQUIVALENT_PROPERTY = "owl:equivalentProperty"
diff --git a/src/sssom/util.py b/src/sssom/util.py
@@ -23,12 +23,15 @@
 from sssom_schema import MappingSet, slots
 
 from .constants import (
+    CARDINALITY_SCOPE,
     COLUMN_INVERT_DICTIONARY,
     COMMENT,
     CONFIDENCE,
+    MAPPING_CARDINALITY,
     MAPPING_JUSTIFICATION,
     MAPPING_SET_ID,
     MAPPING_SET_SOURCE,
+    NO_TERM_FOUND,
     OBJECT_CATEGORY,
     OBJECT_ID,
     OBJECT_LABEL,
@@ -393,6 +396,106 @@ def condense(self) -> List[str]:
         self.df.drop(columns=condensed, inplace=True)
         return condensed
 
+    def infer_cardinality(self, scope: Optional[List[str]] = None) -> None:
+        """Infer cardinality values in the set.
+
+        This method will automatically fill the `mapping_cardinality` slot for
+        all records in the set, overwriting any pre-existing values. It also
+        sets the `cardinality_scope` column to the `|`-joined scope, or drops
+        that column when the effective scope is empty.
+
+        See <https://mapping-commons.github.io/sssom/spec-model/#mapping-cardinality-and-cardinality-scope>
+        for more information about cardinality computation,
+        <https://mapping-commons.github.io/sssom/spec-model/#literal-mappings>
+        for how to deal with literal mapping records, and
+        <https://mapping-commons.github.io/sssom/spec-model/#representing-unmapped-entities>
+        for how to deal with mapping records involving `sssom:NoTermFound`.
+
+        :param scope: A list of slot names that defines the subset of the
+                      records in which cardinality will be computed. For
+                      example, with a scope of `['predicate_id']`, for any
+                      given record the cardinality will be computed relatively
+                      to the subset of records that have the same predicate.
+                      The default is an empty list, meaning that cardinality is
+                      computed relatively to the entire set of records. Names that
+                      are not mapping slots are ignored with a warning; duplicate
+                      names are dropped and the given order is preserved.
+        """
+        schema = SSSOMSchemaView()
+        # Drop duplicate names while keeping the order given by the caller
+        requested = list(dict.fromkeys(scope or []))
+        unknown_slots = [slot for slot in requested if slot not in schema.mapping_slots]
+        if unknown_slots:
+            logging.warning(f"Ignoring invalid slot name(s): {unknown_slots}.")
+        # Filter with a list rather than a set difference so that the order of
+        # the scope (and thus the `cardinality_scope` value) is deterministic
+        scope = [slot for slot in requested if slot not in unknown_slots]
+
+        def _text(value: Any) -> str:
+            # Cells may hold non-string values (e.g. a float, or NaN for a
+            # missing value); stringify them so that joining cannot fail.
+            return value if isinstance(value, str) else str(value)
+
+        # Helper function to transform a row into a string that represents
+        # a subject (or object) in a given scope; `side` is either `subject`
+        # or `object`.
+        def _to_string(row: dict[str, Any], side: str) -> str:
+            # We start with a one-letter code (`L` or `E`) so that literal and
+            # non-literal mapping records are always distinguishable and can
+            # be counted separately.
+            if row.get(f"{side}_type") == "rdfs literal":
+                parts = ["L", _text(row.get(f"{side}_label", ""))]
+            else:
+                parts = ["E", _text(row.get(f"{side}_id", ""))]
+            parts.extend(_text(row.get(slot, "")) for slot in scope)
+            return "\0".join(parts)
+
+        #: Unique subjects for any given object
+        subjects_by_object: defaultdict[str, set[str]] = defaultdict(set)
+        #: Unique objects for any given subject
+        objects_by_subject: defaultdict[str, set[str]] = defaultdict(set)
+
+        # We iterate over the dataframe a single time, building each key only
+        # once. Every record yields either its (subject, object) pair of keys
+        # or, when sssom:NoTermFound is involved, its final cardinality.
+        entries: list[Any] = []
+        for _, row in self.df.iterrows():
+            no_subject = row.get(SUBJECT_ID) == NO_TERM_FOUND
+            no_object = row.get(OBJECT_ID) == NO_TERM_FOUND
+            if no_subject or no_object:
+                # Mappings to sssom:NoTermFound are ignored for cardinality
+                # computations; the unmapped side(s) always get a `0`
+                entries.append(f"{0 if no_subject else 1}:{0 if no_object else 1}")
+                continue
+
+            subj = _to_string(row, "subject")
+            obj = _to_string(row, "object")
+            subjects_by_object[obj].add(subj)
+            objects_by_subject[subj].add(obj)
+            entries.append((subj, obj))
+
+        # Second step, now that both tables are complete: the left-hand side is
+        # `1` if the object has a single distinct subject, the right-hand side is
+        # `1` if the subject has a single distinct object, and `n` otherwise.
+        cards = []
+        for entry in entries:
+            if isinstance(entry, str):
+                cards.append(entry)
+                continue
+
+            subj, obj = entry
+            left = "1" if len(subjects_by_object[obj]) == 1 else "n"
+            right = "1" if len(objects_by_subject[subj]) == 1 else "n"
+            cards.append(f"{left}:{right}")
+
+        # Add the computed values to the dataframe
+        self.df[MAPPING_CARDINALITY] = cards
+        if scope:
+            self.df[CARDINALITY_SCOPE] = "|".join(scope)
+        else:
+            # No scope, so remove any pre-existing "cardinality_scope" column
+            self.df.drop(columns=CARDINALITY_SCOPE, inplace=True, errors="ignore")
+
 
 def _standardize_curie_or_iri(curie_or_iri: str, *, converter: Converter) -> str:
     """Standardize a CURIE or IRI, returning the original if not possible.
diff --git a/tests/data/cardinality-scope.sssom.tsv b/tests/data/cardinality-scope.sssom.tsv
@@ -0,0 +1,14 @@
+#curie_map:
+#  COMENT: https://example.com/entities/
+#  NETENT: https://example.net/entities/
+#  ORGENT: https://example.org/entities/
+#  SRC: https://example.org/sources/
+#mapping_set_id: https://example.org/sets/cardinality-scope-empty
+#license: https://creativecommons.org/licenses/by/4.0/
+subject_id	subject_label	predicate_id	object_id	object_label	mapping_justification	object_source	mapping_cardinality
+ORGENT:0001	alice	skos:closeMatch	COMENT:0011	alpha	semapv:ManualMappingCuration	SRC:com	1:n
+ORGENT:0001	alice	skos:closeMatch	NETENT:0111	alpha	semapv:ManualMappingCuration	SRC:net	1:n
+ORGENT:0002	bob	skos:closeMatch	COMENT:0012	beta	semapv:ManualMappingCuration	SRC:com	1:n
+ORGENT:0002	bob	skos:closeMatch	NETENT:0112	bravo	semapv:ManualMappingCuration	SRC:net	1:n
+ORGENT:0007	gavin	skos:closeMatch	NETENT:0117	golf	semapv:ManualMappingCuration	SRC:net	1:n
+ORGENT:0007	gavin	skos:exactMatch	COMENT:0013	gamma	semapv:ManualMappingCuration	SRC:com	1:n
diff --git a/tests/data/cardinality-with-NoTermFound.sssom.tsv b/tests/data/cardinality-with-NoTermFound.sssom.tsv
@@ -0,0 +1,11 @@
+#curie_map:
+#  OBJ: https://example.org/object/
+#  SRC: https://example.org/sources/
+#  SUBJ: https://example.org/subject/
+#mapping_set_id: https://example.org/sets/cardinality-with-unmapped-entities
+#license: https://creativecommons.org/licenses/by/4.0/
+subject_id	predicate_id	object_id	mapping_justification	subject_source	object_source	mapping_cardinality	comment
+SUBJ:0001	skos:exactMatch	sssom:NoTermFound	semapv:ManualMappingCuration	SRC:A	SRC:B	1:0	S1 in vocabulary A has no exact match in vocabulary B
+SUBJ:0001	skos:closeMatch	OBJ:0001	semapv:ManualMappingCuration	SRC:A	SRC:B	1:1	S1 mapped only to O1, O1 mapped only to S1 -- the record involving sssom:NoTermFound does not count, as it is an absence of match rather than an actual mapping
+sssom:NoTermFound	skos:exactMatch	OBJ:0002	semapv:ManualMappingCuration	SRC:C	SRC:D	0:1	O2 in vocabulary D has no exact match in vocabulary C
+sssom:NoTermFound	skos:exactMatch	sssom:NoTermFound	semapv:ManualMappingCuration	SRC:E	SRC:F	0:0	No exact match between any term from vocabulary E and any term for vocabulary F (in other words, the two vocabularies are completely disjoint, at least as far as exact matches are considered)
diff --git a/tests/data/cardinality-with-literal-mappings.sssom.tsv b/tests/data/cardinality-with-literal-mappings.sssom.tsv
@@ -0,0 +1,8 @@
+#curie_map:
+#  OBJ: https://example.org/object/
+#  SUBJ: https://example.org/subject/
+#mapping_set_id: https://example.org/sets/cardinality-with-literal-mappings
+#license: https://creativecommons.org/licenses/by/4.0/
+subject_id	predicate_id	object_id	object_label	mapping_justification	object_type	mapping_cardinality	comment
+SUBJ:0001	skos:exactMatch	OBJ:0001		semapv:LexicalMatching		1:n	S1 mapped to O1 (entity) and O1 (literal)
+SUBJ:0001	skos:exactMatch	OBJ:0001	OBJ:0001	semapv:MappingReview	rdfs literal	1:n	S1 mapped to O1 (entity) and O1 (literal)
diff --git a/tests/data/cardinality.sssom.tsv b/tests/data/cardinality.sssom.tsv
@@ -0,0 +1,15 @@
+#curie_map:
+#  OBJ: https://example.org/object/
+#  SUBJ: https://example.org/subject/
+#mapping_set_id: https://example.org/sets/cardinality
+#license: https://creativecommons.org/licenses/by/4.0/
+subject_id	predicate_id	object_id	mapping_justification	mapping_cardinality	comment
+SUBJ:0001	skos:exactMatch	OBJ:0001	semapv:LexicalMatching	1:1	S1 and O1 only mapped to each other
+SUBJ:0001	skos:exactMatch	OBJ:0001	semapv:MappingReview	1:1	S1 and O1 only mapped to each other
+SUBJ:0002	skos:exactMatch	OBJ:0002	semapv:LexicalMatching	1:n	S2 mapped to both O2 and O3, O2 mapped only to S2
+SUBJ:0002	skos:exactMatch	OBJ:0003	semapv:LexicalMatching	1:n	S2 mapped to both O2 and O3, O3 mapped only to S2
+SUBJ:0003	skos:exactMatch	OBJ:0004	semapv:LexicalMatching	n:1	S3 and S4 both mapped to only O4
+SUBJ:0004	skos:exactMatch	OBJ:0004	semapv:LexicalMatching	n:1	S3 and S4 both mapped to only O4
+SUBJ:0005	skos:exactMatch	OBJ:0005	semapv:LexicalMatching	n:n	S5 mapped to O5 and O6, O5 mapped to S5 and S6
+SUBJ:0005	skos:exactMatch	OBJ:0006	semapv:LexicalMatching	1:n	S5 mapped to O5 and O6, O6 mapped only to S5
+SUBJ:0006	skos:exactMatch	OBJ:0005	semapv:LexicalMatching	n:1	S6 mapped only to O5, O5 mapped to both S5 and S6
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -12,7 +12,9 @@
 from sssom_schema import slots as SSSOM_Slots
 
 from sssom.constants import (
+    CARDINALITY_SCOPE,
     CREATOR_ID,
+    MAPPING_CARDINALITY,
     OBJECT_ID,
     OBJECT_LABEL,
     PREDICATE_ID,
@@ -595,3 +597,40 @@ def test_propagation_fill_empty_mode(self) -> None:
         self.assertIn("mapping_tool", propagated_slots)
         self.assertNotIn("mapping_tool", msdf.metadata)
         self.assertEqual(2, len(msdf.df["mapping_tool"].unique()))
+
+    def test_infer_cardinality(self) -> None:
+        """Compare inferred cardinalities with the ones stored in the test files."""
+        filenames = (
+            "cardinality.sssom.tsv",
+            "cardinality-with-NoTermFound.sssom.tsv",
+            "cardinality-with-literal-mappings.sssom.tsv",
+        )
+        for filename in filenames:
+            table = parse_sssom_table(f"{data_dir}/{filename}")
+            # Save the precomputed values, then drop them to force a fresh inference
+            precomputed = list(table.df[MAPPING_CARDINALITY].values)
+            table.df.drop(columns=MAPPING_CARDINALITY, inplace=True)
+            table.infer_cardinality()
+            self.assertEqual(precomputed, list(table.df[MAPPING_CARDINALITY].values))
+
+    def test_infer_scoped_cardinality(self) -> None:
+        """Check cardinality inference under several cardinality scopes."""
+        table = parse_sssom_table(f"{data_dir}/cardinality-scope.sssom.tsv")
+        all_one_to_one = ["1:1"] * 6
+        all_one_to_many = ["1:n"] * 6
+
+        # Each case pairs a scope with the cardinalities expected for it
+        cases = [
+            (["predicate_id"], ["1:n", "1:n", "1:n", "1:n", "1:1", "1:1"]),
+            (["object_source"], all_one_to_one),
+            # An invalid slot name is ignored: same result as the previous case
+            (["object_source", "not_a_valid_slot_name"], all_one_to_one),
+            # Only invalid slot names: equivalent to an empty scope
+            (["not_a_valid_slot_name"], all_one_to_many),
+        ]
+        for scope, expected in cases:
+            table.infer_cardinality(scope)
+            self.assertEqual(expected, list(table.df[MAPPING_CARDINALITY].values))
+
+        # The last scope was effectively empty, so no scope must be recorded
+        self.assertNotIn(CARDINALITY_SCOPE, table.df.columns)