Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 24 additions & 35 deletions src/sssom/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
MAPPING_CARDINALITY,
MAPPING_JUSTIFICATION,
MAPPING_SET_ID,
MAPPING_SET_SOURCE,
MAPPING_SOURCE,
NO_TERM_FOUND,
OBJECT_CATEGORY,
OBJECT_ID,
Expand Down Expand Up @@ -1118,25 +1118,35 @@ def merge_msdf(
:param reconcile: If reconcile=True, then dedupe(remove redundant lower confidence mappings) and
reconcile (if msdf contains a higher confidence _negative_ mapping, then remove lower
confidence positive one. If confidence is the same, prefer HumanCurated. If both
HumanCurated, prefer negative mapping). Defaults to True.
HumanCurated, prefer negative mapping). Defaults to False.

:returns: Merged MappingSetDataFrame.
"""
# Inject metadata of msdf into df
msdf_with_meta = [inject_metadata_into_df(msdf) for msdf in msdfs]

# merge df [# 'outer' join in pandas == FULL JOIN in SQL]
# df_merged = reduce(
# lambda left, right: left.merge(right, how="outer", on=list(left.columns)),
# [msdf.df for msdf in msdf_with_meta],
# )
# Concat is an alternative to merge when columns are not the same.
# Propagate slots, inject source if possible
source_injected = 0
for msdf in msdfs:
msdf.propagate()
if MAPPING_SET_ID in msdf.metadata and MAPPING_SOURCE not in msdf.df.columns:
msdf.df[MAPPING_SOURCE] = msdf.metadata[MAPPING_SET_ID]
source_injected += 1

columns = set([c for msdf in msdfs for c in msdf.df.columns])
if source_injected > 1:
# If we injected a mapping_source slot into each individual
# record for at least two of the input sets, then we must ignore
# that slot when attempting to remove duplicates below, because
# the mere presence of that slot would cause two identical
# records to appear different just because they come from
# different sources (which they would not do if we had not
# injected the mapping_source above).
columns.remove(MAPPING_SOURCE)

df_merged = reduce(
lambda left, right: pd.concat([left, right], axis=0, ignore_index=True),
[msdf.df for msdf in msdf_with_meta],
).drop_duplicates(ignore_index=True)
[msdf.df for msdf in msdfs],
).drop_duplicates(ignore_index=True, subset=columns)

converter = curies.chain([msdf.converter for msdf in msdf_with_meta])
converter = curies.chain([msdf.converter for msdf in msdfs])
merged_msdf = MappingSetDataFrame.with_converter(df=df_merged, converter=converter)
if reconcile:
merged_msdf.df = filter_redundant_rows(merged_msdf.df)
Expand Down Expand Up @@ -1296,27 +1306,6 @@ def deal_with_negation(df: pd.DataFrame) -> pd.DataFrame:
return return_df


def inject_metadata_into_df(msdf: MappingSetDataFrame) -> MappingSetDataFrame:
    """Inject metadata dictionary key-value pairs into DataFrame columns of a MappingSetDataFrame.

    Every metadata key that is a mapping slot (according to
    ``SSSOMSchemaView().mapping_slots``) and is not already a column of
    ``msdf.df`` is added as a new column whose cells all hold the metadata
    value converted to a string. List values are joined with ``|``. The
    ``MAPPING_SET_ID`` key is written to the ``MAPPING_SET_SOURCE`` column
    rather than to a column of its own.

    A column that already exists in the data frame is never overwritten;
    this includes an existing ``MAPPING_SET_SOURCE`` column when the
    metadata key being injected is ``MAPPING_SET_ID``.

    The data frame is modified in place.

    :param msdf: MappingSetDataFrame with metadata separate.

    :returns: The same MappingSetDataFrame, with metadata as columns
    """
    # TODO add this into the "standardize" function introduced in
    # https://github.com/mapping-commons/sssom-py/pull/438
    # TODO Check if 'k' is a valid 'slot' for 'mapping' [sssom.yaml]
    slots = SSSOMSchemaView().mapping_slots
    for k, v in msdf.metadata.items():
        if k not in slots or k in msdf.df.columns:
            continue
        # The mapping set ID is stored on individual records under a
        # different column name than its metadata key.
        column = MAPPING_SET_SOURCE if k == MAPPING_SET_ID else k
        if column in msdf.df.columns:
            # The existence check must be made against the column actually
            # written, otherwise per-record values already present in that
            # column would be silently replaced by the set-level value.
            continue
        if isinstance(v, list):
            # str() each element so that non-string list items cannot make
            # the join raise a TypeError.
            v = "|".join(str(x) for x in v)
        msdf.df[column] = str(v)
    return msdf


ExtensionLiteral = Literal["tsv", "csv"]


Expand Down
2 changes: 1 addition & 1 deletion tests/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def setUp(self) -> None:
def test_merge_multiple_inputs(self) -> None:
"""Test merging of multiple msdfs."""
merged_msdf = merge_msdf(*self.msdfs)
self.assertEqual(275, len(merged_msdf.df))
self.assertEqual(200, len(merged_msdf.df))
Copy link
Contributor Author

@gouttegd gouttegd Oct 8, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same explanation as above: the previous version of merge_msdf did not drop all duplicates because of the incorrect propagation of slots that should not have been propagated.


def test_merge_single_input(self) -> None:
"""Test merging when a single msdf is provided."""
Expand Down
2 changes: 1 addition & 1 deletion tests/test_reconcile.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def test_merge(self) -> None:
msdf3 = parse_sssom_table(data_dir / "basic.tsv")
merged_msdf1 = merge_msdf(self.msdf1, msdf3)

self.assertEqual(152, len(merged_msdf1.df))
self.assertEqual(149, len(merged_msdf1.df))
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This difference is expected, and the new behaviour is the correct one.

The merge of msdf1 (the basic3.tsv file) and msdf3 (the basic.tsv file) contains three identical records, but the previous version of merge_msdf failed to treat them as duplicates because of the incorrect propagation of the creator_id slot: basic3.tsv and basic.tsv have different values for creator_id, so as part of the merge operation all records in msdf1 got one creator_id value and all records in msdf3 got another, which made every record from one set differ from every record from the other.


merged_msdf2 = merge_msdf(self.msdf2, msdf3)
self.assertEqual(174, len(merged_msdf2.df))
Expand Down
9 changes: 0 additions & 9 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@
get_dict_from_mapping,
get_file_extension,
get_prefixes_used_in_table,
inject_metadata_into_df,
invert_mappings,
is_multivalued_slot,
)
Expand Down Expand Up @@ -213,14 +212,6 @@ def test_invert_asymmetric_nodes(self) -> None:
inverted_object_labels = inverted_df["object_label"].values
self.assertNotIn(False, original_subject_labels == inverted_object_labels)

def test_inject_metadata_into_df(self) -> None:
    """Check that the injected ``creator_id`` column holds one pipe-separated value."""
    source_path = f"{data_dir}/test_inject_metadata_msdf.tsv"
    injected = inject_metadata_into_df(parse_sssom_table(source_path))
    # ``.item()`` only succeeds when exactly one distinct value is left, so
    # this also verifies that every row received the same creator string.
    distinct_creators = injected.df["creator_id"].drop_duplicates().values
    self.assertEqual(
        distinct_creators.item(),
        "orcid:0000-0001-5839-2535|orcid:0000-0001-5839-2532",
    )


class TestUtils(unittest.TestCase):
"""Unit test for utility functions."""
Expand Down
Loading