|
23 | 23 | from sssom_schema import MappingSet, slots |
24 | 24 |
|
25 | 25 | from .constants import ( |
| 26 | + CARDINALITY_SCOPE, |
26 | 27 | COLUMN_INVERT_DICTIONARY, |
27 | 28 | COMMENT, |
28 | 29 | CONFIDENCE, |
| 30 | + MAPPING_CARDINALITY, |
29 | 31 | MAPPING_JUSTIFICATION, |
30 | 32 | MAPPING_SET_ID, |
31 | 33 | MAPPING_SET_SOURCE, |
| 34 | + NO_TERM_FOUND, |
32 | 35 | OBJECT_CATEGORY, |
33 | 36 | OBJECT_ID, |
34 | 37 | OBJECT_LABEL, |
@@ -393,6 +396,106 @@ def condense(self) -> List[str]: |
393 | 396 | self.df.drop(columns=condensed, inplace=True) |
394 | 397 | return condensed |
395 | 398 |
|
def infer_cardinality(self, scope: Optional[List[str]] = None) -> None:
    """Infer cardinality values in the set.

    This method will automatically fill the `mapping_cardinality` slot for
    all records in the set, overwriting any pre-existing values. The
    `cardinality_scope` slot is set to the pipe-separated scope when a
    non-empty scope is used, and removed otherwise.

    See <https://mapping-commons.github.io/sssom/spec-model/#mapping-cardinality-and-cardinality-scope>
    for more information about cardinality computation,
    <https://mapping-commons.github.io/sssom/spec-model/#literal-mappings>
    for how to deal with literal mapping records, and
    <https://mapping-commons.github.io/sssom/spec-model/#representing-unmapped-entities>
    for how to deal with mapping records involving `sssom:NoTermFound`.

    :param scope: A list of slot names that defines the subset of the
                  records in which cardinality will be computed. For
                  example, with a scope of `['predicate_id']`, for any
                  given record the cardinality will be computed relatively
                  to the subset of records that have the same predicate.
                  The default is an empty list, meaning that cardinality is
                  computed relatively to the entire set of records.
                  Names that are not mapping slots are ignored (with a
                  warning); the order of the remaining names is preserved
                  and duplicates are dropped.
    """
    if scope is None:
        scope = []

    schema = SSSOMSchemaView()
    unknown_slots = [slot for slot in scope if slot not in schema.mapping_slots]
    if unknown_slots:
        logging.warning(f"Ignoring invalid slot name(s): {unknown_slots}.")
    # Filter while keeping the caller's order (and dropping duplicates), so
    # that the resulting `cardinality_scope` value is deterministic. The
    # caller's list is never modified.
    scope = [slot for slot in dict.fromkeys(scope) if slot not in unknown_slots]

    # Helper function to read a cell as a string. Absent columns, `None`
    # and NaN all count as an empty value; any other non-string value
    # (e.g. a float `confidence`) is stringified instead of raising.
    def _cell(row: Any, column: str) -> str:
        value = row.get(column, "")
        if isinstance(value, str):
            return value
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    # Helper function to transform a row into a hashable key that
    # represents a subject (or object) in the requested scope; `side` is
    # either `subject` or `object`.
    def _key(row: Any, side: str) -> tuple[str, ...]:
        # A one-letter code (`L` or `E`) comes first so that literal and
        # non-literal mapping records are always distinguishable and can be
        # counted separately. A tuple (rather than a joined string) keeps
        # the components unambiguous whatever characters they contain.
        if row.get(f"{side}_type") == "rdfs literal":
            head = ("L", _cell(row, f"{side}_label"))
        else:
            head = ("E", _cell(row, f"{side}_id"))
        return head + tuple(_cell(row, slot) for slot in scope)

    #: Unique subjects for any given object
    subjects_by_object: defaultdict[tuple[str, ...], set[tuple[str, ...]]] = defaultdict(set)
    #: Unique objects for any given subject
    objects_by_subject: defaultdict[tuple[str, ...], set[tuple[str, ...]]] = defaultdict(set)

    # Single pass over the records: collect the different objects mapped to
    # each subject and vice versa. For each row we remember either its
    # final cardinality (sssom:NoTermFound special cases) or its
    # (subject, object) keys, to be resolved once all rows have been seen.
    pending: list[Any] = []
    for _, row in self.df.iterrows():
        no_subject = row.get(SUBJECT_ID) == NO_TERM_FOUND
        no_object = row.get(OBJECT_ID) == NO_TERM_FOUND
        if no_subject or no_object:
            # Mappings to sssom:NoTermFound are ignored for cardinality
            # computations and get a fixed value: "0:0", "0:1" or "1:0".
            pending.append(f"{0 if no_subject else 1}:{0 if no_object else 1}")
            continue

        subj = _key(row, "subject")
        obj = _key(row, "object")
        subjects_by_object[obj].add(subj)
        objects_by_subject[subj].add(obj)
        pending.append((subj, obj))

    # Resolve the general case now that all counts are known. The values
    # are gathered in a list because rows must not be modified while
    # iterating over the dataframe.
    cards = []
    for entry in pending:
        if isinstance(entry, str):
            cards.append(entry)
            continue
        subj, obj = entry
        subject_side = "1" if len(subjects_by_object[obj]) == 1 else "n"
        object_side = "1" if len(objects_by_subject[subj]) == 1 else "n"
        cards.append(f"{subject_side}:{object_side}")

    # Add the computed values to the dataframe
    self.df[MAPPING_CARDINALITY] = cards
    if scope:
        self.df[CARDINALITY_SCOPE] = "|".join(scope)
    else:
        # No scope, so remove any pre-existing "cardinality_scope" column
        self.df.drop(columns=CARDINALITY_SCOPE, inplace=True, errors="ignore")
396 | 499 |
|
397 | 500 | def _standardize_curie_or_iri(curie_or_iri: str, *, converter: Converter) -> str: |
398 | 501 | """Standardize a CURIE or IRI, returning the original if not possible. |
|
0 commit comments