Add the infer_cardinality method.

gouttegd · gouttegd · commit cbbbe48017c5 · 2025-09-01T23:47:17.000+01:00
Add a new `infer_cardinality` method to the `MappingSetDataFrame` to
fill the `mapping_cardinality` slot with computed cardinality values.

The approach used here is more or less a direct Python translation of my
existing implementation in SSSOM-Java.

The gist of it is that we iterate over the entire set of records a first
time to populate two hash tables: one that associates a subject to all
the different objects it is mapped to, and one that associates an object
to all the different subjects it is mapped to. Then we can iterate over
the records a second time, and for every record we can immediately get
(1) the number of different objects mapped to the same subject and (2)
the number of different subjects mapped to the same object; the
combination of those two values gives us the cardinality we are looking
for.

To deal with the concept of "scope", the "subjects" and "objects" that
we use to fill the hash tables are not made of only the "subject_id"
slot or the "object_id" slot, but also of all the slots that define the
scope. For example, if the scope is `["predicate_id"]`, then for the
following record:

  subject_id   predicate_id      object_id
  DO:1234      skos:exactMatch   HP:5678

the "subject" string will contain both `DO:1234` and `skos:exactMatch`,
and the "object" string will contain both `HP:5678` and
`skos:exactMatch`.
diff --git a/src/sssom/util.py b/src/sssom/util.py
@@ -393,6 +393,100 @@ def condense(self) -> List[str]:
         self.df.drop(columns=condensed, inplace=True)
         return condensed
 
+    def infer_cardinality(self, scope: List[str] = None) -> None:
+        """Infer cardinality values in the set.
+
+        This method will automatically fill the `mapping_cardinality` slot for
+        all records in the set, overwriting any pre-existing values.
+
+        See <https://mapping-commons.github.io/sssom/spec-model/#mapping-cardinality-and-cardinality-scope>
+        for more information about cardinality computation,
+        <https://mapping-commons.github.io/sssom/spec-model/#literal-mappings>
+        for how to deal with literal mapping records, and
+        <https://mapping-commons.github.io/sssom/spec-model/#representing-unmapped-entities>
+        for how to deal with mapping records involving `sssom:NoTermFound`.
+
+        :param scope: A list of slot names that defines the subset of the
+                      records in which cardinality will be computed. For
+                      example, with a scope of `['predicate_id']`, for any
+                      given record the cardinality will be computed relatively
+                      to the subset of records that have the same predicate.
+                      The default is an empty list, meaning that cardinality is
+                      computed relatively to the entire set of records.
+        """
+        if scope is None:
+            scope = []
+        subjects_by_object = {}  # Unique subjects for any given object
+        objects_by_subject = {}  # Unique objects for any given subject
+
+        # Helper function to transform a row into a string that represents
+        # a subject (or object) in a given scope; `side` is either `subject`
+        # or `object`.
+        def _to_string(row, side):
+            # We prepend a one-letter code (`L` or `E`) to the actual subject
+            # or object so that literal and non-literal mapping records are
+            # always distinguishable and can be counted separately.
+            if row.get(f"{side}_type") == "rdfs literal":
+                s = "L\0" + row.get(f"{side}_label", "")
+            else:
+                s = "E\0" + row.get(f"{side}_id", "")
+            for slot in scope:
+                s += "\0" + row.get(slot, "")
+            return s
+
+        # We iterate over the records a first time to collect the different
+        # objects mapped to each subject and vice versa
+        for _, row in self.df.iterrows():
+            if (
+                row.get("subject_id") == "sssom:NoTermFound"
+                or row.get("object_id") == "sssom:NoTermFound"
+            ):
+                # Mappings to sssom:NoTermFound are ignored for cardinality computations
+                continue
+
+            subj = _to_string(row, "subject")
+            obj = _to_string(row, "object")
+
+            subjects_by_object.setdefault(obj, set()).add(subj)
+            objects_by_subject.setdefault(subj, set()).add(obj)
+
+        # Second iteration to compute the actual cardinality values. Since we
+        # must not modify a row while we are iterating over the dataframe, we
+        # collect the values in a separate array.
+        cards = []
+        for _, row in self.df.iterrows():
+            # Special cases involving sssom:NoTermFound on either side
+            if row.get("subject_id") == "sssom:NoTermFound":
+                if row.get("object_id") == "sssom:NoTermFound":
+                    cards.append("0:0")
+                else:
+                    cards.append("0:1")
+            elif row.get("object_id") == "sssom:NoTermFound":
+                cards.append("1:0")
+            else:
+                # General case
+                n_subjects = len(subjects_by_object[_to_string(row, "object")])
+                n_objects = len(objects_by_subject[_to_string(row, "subject")])
+
+                if n_subjects == 1:
+                    if n_objects == 1:
+                        cards.append("1:1")
+                    else:
+                        cards.append("1:n")
+                else:
+                    if n_objects == 1:
+                        cards.append("n:1")
+                    else:
+                        cards.append("n:n")
+
+        # Add the computed values to the dataframe
+        self.df["mapping_cardinality"] = cards
+        if len(scope) > 0:
+            self.df["cardinality_scope"] = "|".join(scope)
+        else:
+            # No scope, so remove any pre-existing "cardinality_scope" column
+            self.df.drop(columns="cardinality_scope", inplace=True, errors="ignore")
+
 
 def _standardize_curie_or_iri(curie_or_iri: str, *, converter: Converter) -> str:
     """Standardize a CURIE or IRI, returning the original if not possible.