def infer_cardinality(self, scope: List[str] = None) -> None:
    """Infer cardinality values in the set.

    This method fills the `mapping_cardinality` slot for all records in
    the set, overwriting any pre-existing values.

    For every record, the subject side of the cardinality is `1` when the
    record's object is mapped to a single distinct subject and `n`
    otherwise; the object side is computed symmetrically. Two special
    rules apply:

    * A side whose `<side>_type` is `rdfs literal` is identified by its
      `<side>_label` instead of its `<side>_id`, and a literal is never
      considered equal to a non-literal entity.
    * Records where either side is `sssom:NoTermFound` are not counted
      when computing the cardinality of other records; they themselves
      get `0:1`, `1:0` or `0:0` depending on which side is missing.

    See the SSSOM specification for more information about cardinality
    computation, literal mapping records, and `sssom:NoTermFound`.

    :param scope: A list of slot names that defines the subset of the
                  records in which cardinality will be computed. For
                  example, with a scope of `['predicate_id']`, for any
                  given record the cardinality will be computed relatively
                  to the subset of records that have the same predicate.
                  Names that are not mapping slots are ignored (with a
                  warning) and duplicated names are only considered once;
                  the order given by the caller is otherwise preserved.
                  The default is an empty list, meaning that cardinality is
                  computed relatively to the entire set of records.
    """
    # De-duplicate while keeping the caller's order, so that the value
    # written to `cardinality_scope` is deterministic.
    scope = list(dict.fromkeys(scope)) if scope else []
    if scope:
        # The schema is only needed to validate slot names.
        valid_slots = SSSOMSchemaView().mapping_slots
        unknown_slots = [slot for slot in scope if slot not in valid_slots]
        if unknown_slots:
            logging.warning(f"Ignoring invalid slot name(s): {unknown_slots}.")
            scope = [slot for slot in scope if slot in valid_slots]

    def _as_text(value) -> str:
        # Missing cells may show up as None or NaN (the only float that is
        # not equal to itself); other non-string cells (e.g. a numeric
        # slot used as scope) are stringified so that they can be compared.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    def _entity_key(row, side):
        # Build a hashable key representing the subject (or object) of a
        # row within the requested scope; `side` is either `subject` or
        # `object`. The leading one-letter code (`L` or `E`) keeps literal
        # and non-literal records distinguishable so that they are counted
        # separately. A tuple is used rather than a joined string so that
        # no separator can ever clash with the values themselves.
        if row.get(f"{side}_type") == "rdfs literal":
            head = ("L", _as_text(row.get(f"{side}_label", "")))
        else:
            head = ("E", _as_text(row.get(f"{side}_id", "")))
        return head + tuple(_as_text(row.get(slot, "")) for slot in scope)

    subjects_by_object = {}  # Unique subjects for any given object
    objects_by_subject = {}  # Unique objects for any given subject

    # Single pass over the records: rows involving sssom:NoTermFound get
    # their final value immediately and are ignored for the counts; other
    # rows are indexed and resolved once all counts are known. The values
    # are collected in a separate list because rows must not be modified
    # while iterating over the dataframe.
    cards = []
    pending = []  # (row position, subject key, object key)
    for position, (_, row) in enumerate(self.df.iterrows()):
        no_subject = row.get(SUBJECT_ID) == NO_TERM_FOUND
        no_object = row.get(OBJECT_ID) == NO_TERM_FOUND
        if no_subject or no_object:
            cards.append(("0" if no_subject else "1") + ":" + ("0" if no_object else "1"))
            continue

        subj = _entity_key(row, "subject")
        obj = _entity_key(row, "object")
        subjects_by_object.setdefault(obj, set()).add(subj)
        objects_by_subject.setdefault(subj, set()).add(obj)
        cards.append(None)
        pending.append((position, subj, obj))

    # General case: resolve the rows that were waiting for the counts.
    for position, subj, obj in pending:
        subject_side = "1" if len(subjects_by_object[obj]) == 1 else "n"
        object_side = "1" if len(objects_by_subject[subj]) == 1 else "n"
        cards[position] = f"{subject_side}:{object_side}"

    # Add the computed values to the dataframe
    self.df[MAPPING_CARDINALITY] = cards
    if scope:
        self.df[CARDINALITY_SCOPE] = "|".join(scope)
    else:
        # No scope, so remove any pre-existing "cardinality_scope" column
        self.df.drop(columns=CARDINALITY_SCOPE, inplace=True, errors="ignore")
diff --git a/tests/data/cardinality-scope.sssom.tsv b/tests/data/cardinality-scope.sssom.tsv new file mode 100644 index 00000000..2dc7613d --- /dev/null +++ b/tests/data/cardinality-scope.sssom.tsv @@ -0,0 +1,14 @@ +#curie_map: +# COMENT: https://example.com/entities/ +# NETENT: https://example.net/entities/ +# ORGENT: https://example.org/entities/ +# SRC: https://example.org/sources/ +#mapping_set_id: https://example.org/sets/cardinality-scope-empty +#license: https://creativecommons.org/licenses/by/4.0/ +subject_id subject_label predicate_id object_id object_label mapping_justification object_source mapping_cardinality +ORGENT:0001 alice skos:closeMatch COMENT:0011 alpha semapv:ManualMappingCuration SRC:com 1:n +ORGENT:0001 alice skos:closeMatch NETENT:0111 alpha semapv:ManualMappingCuration SRC:net 1:n +ORGENT:0002 bob skos:closeMatch COMENT:0012 beta semapv:ManualMappingCuration SRC:com 1:n +ORGENT:0002 bob skos:closeMatch NETENT:0112 bravo semapv:ManualMappingCuration SRC:net 1:n +ORGENT:0007 gavin skos:closeMatch NETENT:0117 golf semapv:ManualMappingCuration SRC:net 1:n +ORGENT:0007 gavin skos:exactMatch COMENT:0013 gamma semapv:ManualMappingCuration SRC:com 1:n diff --git a/tests/data/cardinality-with-NoTermFound.sssom.tsv b/tests/data/cardinality-with-NoTermFound.sssom.tsv new file mode 100644 index 00000000..f4e24d13 --- /dev/null +++ b/tests/data/cardinality-with-NoTermFound.sssom.tsv @@ -0,0 +1,11 @@ +#curie_map: +# OBJ: https://example.org/object/ +# SRC: https://example.org/sources/ +# SUBJ: https://example.org/subject/ +#mapping_set_id: https://example.org/sets/cardinality-with-unmapped-entities +#license: https://creativecommons.org/licenses/by/4.0/ +subject_id predicate_id object_id mapping_justification subject_source object_source mapping_cardinality comment +SUBJ:0001 skos:exactMatch sssom:NoTermFound semapv:ManualMappingCuration SRC:A SRC:B 1:0 S1 in vocabulary A has no exact match in vocabulary B +SUBJ:0001 skos:closeMatch OBJ:0001 
semapv:ManualMappingCuration SRC:A SRC:B 1:1 S1 mapped only to O1, O1 mapped only to S1 -- the record involving sssom:NoTermFound does not count, as it is an absence of match rather than an actual mapping +sssom:NoTermFound skos:exactMatch OBJ:0002 semapv:ManualMappingCuration SRC:C SRC:D 0:1 O2 in vocabulary D has no exact match in vocabulary C +sssom:NoTermFound skos:exactMatch sssom:NoTermFound semapv:ManualMappingCuration SRC:E SRC:F 0:0 No exact match between any term from vocabulary E and any term for vocabulary F (in other words, the two vocabularies are completely disjoint, at least as far as exact matches are considered) diff --git a/tests/data/cardinality-with-literal-mappings.sssom.tsv b/tests/data/cardinality-with-literal-mappings.sssom.tsv new file mode 100644 index 00000000..7e35bc81 --- /dev/null +++ b/tests/data/cardinality-with-literal-mappings.sssom.tsv @@ -0,0 +1,8 @@ +#curie_map: +# OBJ: https://example.org/object/ +# SUBJ: https://example.org/subject/ +#mapping_set_id: https://example.org/sets/cardinality-with-literal-mappings +#license: https://creativecommons.org/licenses/by/4.0/ +subject_id predicate_id object_id object_label mapping_justification object_type mapping_cardinality comment +SUBJ:0001 skos:exactMatch OBJ:0001 semapv:LexicalMatching 1:n S1 mapped to O1 (entity) and O1 (literal) +SUBJ:0001 skos:exactMatch OBJ:0001 OBJ:0001 semapv:MappingReview rdfs literal 1:n S1 mapped to O1 (entity) and O1 (literal) diff --git a/tests/data/cardinality.sssom.tsv b/tests/data/cardinality.sssom.tsv new file mode 100644 index 00000000..24f57a33 --- /dev/null +++ b/tests/data/cardinality.sssom.tsv @@ -0,0 +1,15 @@ +#curie_map: +# OBJ: https://example.org/object/ +# SUBJ: https://example.org/subject/ +#mapping_set_id: https://example.org/sets/cardinality +#license: https://creativecommons.org/licenses/by/4.0/ +subject_id predicate_id object_id mapping_justification mapping_cardinality comment +SUBJ:0001 skos:exactMatch OBJ:0001 semapv:LexicalMatching 
def test_infer_cardinality(self) -> None:
    """Test cardinality computation against values stored in the test files."""
    fixtures = (
        "cardinality.sssom.tsv",
        "cardinality-with-NoTermFound.sssom.tsv",
        "cardinality-with-literal-mappings.sssom.tsv",
    )
    for filename in fixtures:
        # One sub-test per file, so that a failure on one fixture does not
        # hide the results for the others.
        with self.subTest(filename=filename):
            msdf = parse_sssom_table(f"{data_dir}/{filename}")
            # Expected values are already contained in the test file
            expected = list(msdf.df[MAPPING_CARDINALITY].values)
            msdf.df.drop(columns=MAPPING_CARDINALITY, inplace=True)
            msdf.infer_cardinality()
            self.assertEqual(expected, list(msdf.df[MAPPING_CARDINALITY].values))


def test_infer_scoped_cardinality(self) -> None:
    """Test cardinality computation with scopes."""
    msdf = parse_sssom_table(f"{data_dir}/cardinality-scope.sssom.tsv")

    msdf.infer_cardinality(["predicate_id"])
    expected = ["1:n", "1:n", "1:n", "1:n", "1:1", "1:1"]
    self.assertEqual(expected, list(msdf.df[MAPPING_CARDINALITY].values))
    # The scope that was used must be recorded on every record
    self.assertEqual({"predicate_id"}, set(msdf.df[CARDINALITY_SCOPE].values))

    msdf.infer_cardinality(["object_source"])
    expected = ["1:1", "1:1", "1:1", "1:1", "1:1", "1:1"]
    self.assertEqual(expected, list(msdf.df[MAPPING_CARDINALITY].values))
    self.assertEqual({"object_source"}, set(msdf.df[CARDINALITY_SCOPE].values))

    msdf.infer_cardinality(["object_source", "not_a_valid_slot_name"])
    # should yield the same result as above
    self.assertEqual(expected, list(msdf.df[MAPPING_CARDINALITY].values))
    # only the valid slot name must be recorded as the scope
    self.assertEqual({"object_source"}, set(msdf.df[CARDINALITY_SCOPE].values))

    msdf.infer_cardinality(["not_a_valid_slot_name"])
    # should be equivalent to an empty scope
    expected = ["1:n", "1:n", "1:n", "1:n", "1:n", "1:n"]
    self.assertEqual(expected, list(msdf.df[MAPPING_CARDINALITY].values))
    self.assertNotIn(CARDINALITY_SCOPE, msdf.df.columns)