Skip to content

Commit cf75d07

Browse files
authored
Add support for propagating/condensing slots. (#603)
Add two new methods to the MappingSetDataFrame class: `propagate` and `condense`. `propagate` implements propagation of propagatable slots: when such a slot is present on the MappingSet but _not_ in individual records (i.e. the data frame has no column for that slot), the value is "propagated" down to the individual records. That is, a column for that slot is added to the data frame, with the same value for all records, while the original slot is removed from the set metadata. `condense` performs the opposite operation: when a propagatable slot has only a single value across all records, the value is set once and for all in the set metadata and removed from the individual records (the entire column is removed from the data frame).
1 parent ab69eeb commit cf75d07

File tree

4 files changed

+199
-0
lines changed

4 files changed

+199
-0
lines changed

src/sssom/constants.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,16 @@ def double_slots(self) -> Set[str]:
268268
"""Return the slot names for SSSOMSchemaView object."""
269269
return {k for k, v in self.dict["slots"].items() if v["range"] == "double"}
270270

271+
@cached_property
def propagatable_slots(self) -> List[str]:
    """Return the names of all propagatable slots.

    A mapping-set slot is reported as propagatable when the annotations
    returned by the schema view for that slot contain a ``propagated`` key.
    Only the presence of the key is checked, not its value.

    :return: The propagatable slot names, in the iteration order of
             ``self.mapping_set_slots``.
    """
    annotation_dict = self.view.annotation_dict
    return [
        slot_name
        for slot_name in self.mapping_set_slots
        # A slot without any annotation yields None; treat it as "no keys".
        if "propagated" in (annotation_dict(slot_name) or {})
    ]
280+
271281

272282
@lru_cache(1)
273283
def _get_sssom_schema_object() -> SSSOMSchemaView:

src/sssom/util.py

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@
5858
UNKNOWN_IRI,
5959
MetadataType,
6060
PathOrIO,
61+
SSSOMSchemaView,
6162
_get_sssom_schema_object,
6263
get_default_metadata,
6364
)
@@ -299,6 +300,99 @@ def remove_mappings(self, msdf: "MappingSetDataFrame") -> None:
299300
self.df = self.df[self.df.columns.drop(list(self.df.filter(regex=r"_2")))]
300301
self.clean_prefix_map()
301302

303+
def propagate(self, fill_empty: bool = False) -> List[str]:
    """Propagate slot values from the set level down to individual records.

    Propagation, as defined by the SSSOM specification, is the process by
    which the values of so-called "propagatable slots" in the set metadata
    are moved to the corresponding slots in each individual mapping
    record.

    Propagation of a slot is only allowed iff no individual records
    already have a value for that slot.

    A propagated slot is removed from ``self.metadata`` and written to a
    column of ``self.df``. Values of multivalued slots are serialised as a
    single ``|``-separated string.

    :param fill_empty: If True, propagation of a slot is allowed even if
                       some individual records already have a value for
                       that slot. The set-level value will be propagated to
                       all the records for which the slot is empty. Note
                       that (1) this is not spec-compliant behaviour, and
                       (2) this makes the operation non-reversible by a
                       subsequent condensation.
    :return: The list of slots that were effectively propagated.
    """
    schema = SSSOMSchemaView()
    propagated = []

    for slot in schema.propagatable_slots:
        if slot not in self.metadata:  # Nothing to propagate
            continue
        is_present = slot in self.df.columns
        if is_present and not fill_empty:
            logging.warning(
                "Not propagating value for '%s' because the slot is already set on individual records.",
                slot,
            )
            continue

        value = self.metadata.pop(slot)
        if schema.view.get_slot(slot).multivalued:
            # A lone string stands for a one-item list; joining it directly
            # would insert a separator between each of its characters.
            if isinstance(value, str):
                value = [value]
            value = "|".join(value)

        if is_present:
            # Only fill the records that have no value of their own (empty
            # string or missing); existing record-level values win.
            is_empty = self.df[slot].eq("") | self.df[slot].isna()
            self.df.loc[is_empty, slot] = value
        else:
            self.df[slot] = value
        propagated.append(slot)

    return propagated
348+
349+
def condense(self) -> List[str]:
    """Condense record-level slot values to the set whenever possible.

    Condensation is the opposite of propagation. It is the process by
    which the values of so-called "propagatable" slots found in individual
    mapping records are moved to the corresponding slots in the set
    metadata.

    Condensation of a slot is only allowed iff (1) all records have the
    same, non-empty value for that slot and (2) the slot does not already
    have a different value in the set metadata.

    A condensed slot has its column dropped from ``self.df``. Values of
    multivalued slots are split on ``|`` before being stored in
    ``self.metadata``.

    :return: The list of slots that were effectively condensed.
    """
    schema = SSSOMSchemaView()
    condensed = []

    for slot in schema.propagatable_slots:
        if slot not in self.df.columns:  # Nothing to condense
            continue
        column = self.df[slot]
        if column.isna().any():
            # At least one record has no value: there is no single value
            # shared by all records (and a missing value cannot be split
            # or meaningfully stored in the set metadata).
            continue
        values = column.unique()
        if len(values) != 1:
            # Either different values across the records, or no records at
            # all (empty data frame); in both cases, cannot condense
            continue
        if values[0] == "":
            # Propagation treats the empty string as "no value"; do the
            # same here rather than storing an empty set-level value.
            continue

        if schema.view.get_slot(slot).multivalued:
            value = values[0].split("|")
        else:
            value = values[0]

        if slot in self.metadata:
            if self.metadata[slot] != value:
                logging.warning(
                    "Not condensing slot '%s' because it already has a different value in the set metadata.",
                    slot,
                )
                continue
            # No need to set the condensed value in the set metadata as it
            # is already there, but we must still remove the column from
            # the dataframe
        else:
            self.metadata[slot] = value
        condensed.append(slot)

    self.df.drop(columns=condensed, inplace=True)
    return condensed
395+
302396

303397
def _standardize_curie_or_iri(curie_or_iri: str, *, converter: Converter) -> str:
304398
"""Standardize a CURIE or IRI, returning the original if not possible.

tests/data/propagatable.tsv

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#curie_map:
2+
# c: http://example.org/c/
3+
# orcid: https://orcid.org/
4+
# x: http://example.org/x/
5+
# y: http://example.org/y/
6+
# z: http://example.org/z/
7+
#mapping_set_id: https://w3id.org/sssom/mapping/tests/data/propagatable.tsv
8+
#creator_id:
9+
# - orcid:1234
10+
# - orcid:5678
11+
#license: https://creativecommons.org/publicdomain/zero/1.0/
12+
#mapping_provider: https://example.org/mappings
13+
#mapping_tool: https://github.com/cmungall/rdf_matcher
14+
#subject_preprocessing:
15+
# - c:rule1
16+
# - c:rule2
17+
subject_id predicate_id object_id mapping_justification mapping_tool
18+
x:appendage owl:equivalentClass y:appendage semapv:ManualMappingCuration
19+
x:appendage owl:equivalentClass z:appendage semapv:LexicalMatching
20+
x:appendage owl:equivalentClass z:appendage semapv:ManualMappingCuration foo matcher
21+
x:bone_element owl:equivalentClass y:bone semapv:LexicalMatching
22+
x:bone_element owl:equivalentClass y:bone semapv:ManualMappingCuration

tests/test_utils.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -522,3 +522,76 @@ def test_get_file_extension(self) -> None:
522522
with self.subTest(path=path, mode="file"), tempfile.TemporaryDirectory() as d:
523523
with Path(d).joinpath(part).open("w") as file:
524524
self.assertEqual(value, get_file_extension(file))
525+
526+
def test_propagation_and_condensation(self) -> None:
527+
"""Test propagating, then condensing back, the values of propagatable slots."""
528+
msdf = parse_sssom_table(f"{data_dir}/propagatable.tsv")
529+
530+
propagated_slots = msdf.propagate()
531+
# creator_id is not a propagatable slot
532+
self.assertNotIn("creator_id", propagated_slots)
533+
self.assertNotIn("creator_id", msdf.df.columns)
534+
# mapping_tool already has a column in the records and should not be propagated
535+
self.assertNotIn("mapping_tool", propagated_slots)
536+
# mapping_provider should be propagated (moved from the set metadata to a column)
537+
self.assertIn("mapping_provider", propagated_slots)
538+
self.assertIn("mapping_provider", msdf.df.columns)
539+
self.assertNotIn("mapping_provider", msdf.metadata)
540+
# Ditto for subject_preprocessing
541+
self.assertIn("subject_preprocessing", propagated_slots)
542+
self.assertIn("subject_preprocessing", msdf.df.columns)
543+
self.assertNotIn("subject_preprocessing", msdf.metadata)
544+
545+
propagated_slots = msdf.propagate()
546+
# Set has been propagated already, no further propagation possible
547+
self.assertEqual(0, len(propagated_slots))
548+
549+
condensed_slots = msdf.condense()
550+
# mapping_tool does not have a unique value and should not be condensed
551+
self.assertNotIn("mapping_tool", condensed_slots)
552+
self.assertIn("mapping_tool", msdf.df.columns)
553+
# mapping_provider should be condensed back (column removed, set metadata restored)
554+
self.assertIn("mapping_provider", condensed_slots)
555+
self.assertNotIn("mapping_provider", msdf.df.columns)
556+
self.assertIn("mapping_provider", msdf.metadata)
557+
# Ditto for subject_preprocessing
558+
self.assertIn("subject_preprocessing", condensed_slots)
559+
self.assertNotIn("subject_preprocessing", msdf.df.columns)
560+
self.assertIn("subject_preprocessing", msdf.metadata)
561+
562+
condensed_slots = msdf.condense()
563+
# Set has been condensed already, no further condensation possible
564+
self.assertEqual(0, len(condensed_slots))
565+
566+
def test_condensation_with_existing_set_values(self) -> None:
567+
"""Test that condensation works as expected when the mapping set already contains values for the to-be-condensed slots."""
568+
msdf = parse_sssom_table(f"{data_dir}/propagatable.tsv")
569+
msdf.propagate()
570+
# Following propagation, all records in msdf have the same
571+
# mapping_provider ("https://example.org/mappings")
572+
573+
# Inject a different mapping_provider value in the set metadata;
574+
# this should prevent that slot from being condensed back
575+
msdf.metadata["mapping_provider"] = "https://example.org/mappings/2"
576+
condensed_slots = msdf.condense()
577+
self.assertNotIn("mapping_provider", condensed_slots)
578+
self.assertIn("mapping_provider", msdf.df.columns)
579+
self.assertEqual("https://example.org/mappings/2", msdf.metadata["mapping_provider"])
580+
581+
# Inject the same mapping_provider value as the one contained in
582+
# the records; this should allow the slot to be condensed, leaving
583+
# the set-level value unchanged
583+
msdf.metadata["mapping_provider"] = "https://example.org/mappings"
584+
condensed_slots = msdf.condense()
585+
self.assertIn("mapping_provider", condensed_slots)
586+
self.assertNotIn("mapping_provider", msdf.df.columns)
587+
self.assertEqual("https://example.org/mappings", msdf.metadata["mapping_provider"])
588+
589+
def test_propagation_fill_empty_mode(self) -> None:
590+
"""Test propagate with fill_empty=True.

In that mode a slot that already has a column in the records is still
propagated, the set-level value only filling the records left empty.
"""
591+
msdf = parse_sssom_table(f"{data_dir}/propagatable.tsv")
592+
593+
propagated_slots = msdf.propagate(fill_empty=True)
594+
# mapping_tool should have been propagated
595+
self.assertIn("mapping_tool", propagated_slots)
596+
self.assertNotIn("mapping_tool", msdf.metadata)
# Two distinct values are expected: presumably the one record-level value
# already present in the test file, plus the propagated set-level value
597+
self.assertEqual(2, len(msdf.df["mapping_tool"].unique()))

0 commit comments

Comments
 (0)