From 65e4ea31657f076bea0b54f177dea0a838ae1ef3 Mon Sep 17 00:00:00 2001 From: Nico Matentzoglu Date: Thu, 3 Aug 2023 16:17:46 +0300 Subject: [PATCH 1/3] Add SSSOM rewire to rewire method --- src/sssom/cli.py | 27 +++++++++++++++++++++------ src/sssom/util.py | 8 ++++++++ 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/src/sssom/cli.py b/src/sssom/cli.py index aab38d65..b5e5b698 100644 --- a/src/sssom/cli.py +++ b/src/sssom/cli.py @@ -42,7 +42,7 @@ split_file, validate_file, ) -from .parsers import parse_sssom_table +from .parsers import from_sssom_dataframe, parse_sssom_table from .rdf_util import rewire_graph from .sparql_util import EndpointConfig, query_mappings from .util import ( @@ -548,11 +548,26 @@ def rewire( # noqa: DAR101 """ msdf = parse_sssom_table(mapping_file) - g = Graph() - g.parse(input, format=input_format) - rewire_graph(g, msdf, precedence=precedence) - rdfstr = g.serialize(format=output_format) - print(rdfstr, file=output) + + if input_format=="sssom-tsv" or input.endswith("sssom.tsv"): + msdf_mapping = parse_sssom_table(input) + df_rewired = rewire_sssom_table() # This is the method you need to implement + + # updating the metadata of the rewired df so you can recognise it was rewired? + metadata = msdf.metadata + metadata["mapping_set_id"] = msdf["mapping_set_id"]+"rewired.sssom.tsv" + + # This maybe has to be revisited as the rewiring can change the SSSOM mapping + prefix_map = msdf.prefix_map + + msdf_rewired = from_sssom_dataframe(df_rewired, prefix_map=prefix_map, meta=metadata) + write_table(msdf_rewired, output) + else: + g = Graph() + g.parse(input, format=input_format) + rewire_graph(g, msdf, precedence=precedence) + outstring = g.serialize(format=output_format) + print(outstring, file=output) @main.command() diff --git a/src/sssom/util.py b/src/sssom/util.py index 7ced7b1f..82b2bf3a 100644 --- a/src/sssom/util.py +++ b/src/sssom/util.py @@ -394,6 +394,14 @@ def get_row_based_on_hierarchy(df: pd.DataFrame): if not hierarchical_df.empty: return hierarchical_df +def rewire_sssom_table(df_rewire: pd.DataFrame, df_mapping: pd.DataFrame): + # 1. Standardise subject and object id columns using + # https://curies.readthedocs.io/en/latest/api/curies.Converter.html#curies.Converter.pd_standardize_curie + # 2. Perform the rewiring + # 3. Store some metadata in the "other" field? + # 4. Return back out + result_df = ... + return result_df def assign_default_confidence( df: pd.DataFrame, From 60b574fe9091b68447b6e44e12cff8ae4f8a4ea8 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Wed, 27 Sep 2023 10:36:34 +0200 Subject: [PATCH 2/3] Update util.py --- src/sssom/util.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/sssom/util.py b/src/sssom/util.py index 26e77591..619462e8 100644 --- a/src/sssom/util.py +++ b/src/sssom/util.py @@ -371,8 +371,9 @@ def get_row_based_on_hierarchy(df: pd.DataFrame): if not hierarchical_df.empty: return hierarchical_df + def rewire_sssom_table(df_rewire: pd.DataFrame, df_mapping: pd.DataFrame): - # 1. Standardise subject and object id columns using + # 1. Standardise subject and object id columns using # https://curies.readthedocs.io/en/latest/api/curies.Converter.html#curies.Converter.pd_standardize_curie # 2. Perform the rewiring # 3. Store some metadata in the "other" field? @@ -380,6 +381,7 @@ def rewire_sssom_table(df_rewire: pd.DataFrame, df_mapping: pd.DataFrame): result_df = ... return result_df + def assign_default_confidence( df: pd.DataFrame, ) -> Tuple[pd.DataFrame, pd.DataFrame]: From 40238416d60c383f0993de34bb7eeba6b2fdd6ed Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Wed, 27 Sep 2023 10:41:53 +0200 Subject: [PATCH 3/3] Add standardize function --- src/sssom/util.py | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/src/sssom/util.py b/src/sssom/util.py index 619462e8..c07c5309 100644 --- a/src/sssom/util.py +++ b/src/sssom/util.py @@ -109,6 +109,23 @@ def clean_context(self) -> None: """Clean up the context.""" self.converter = curies.chain([_get_built_in_prefix_map(), self.converter]) + def _standardize_curie_or_iri(self, curie_or_iri: str) -> str: + """Standardize a CURIE or IRI, returning the original if not possible.""" + if is_iri(curie_or_iri): + return self.converter.standardize_uri(curie_or_iri) or curie_or_iri + if is_curie(curie_or_iri): + return self.converter.standardize_curie(curie_or_iri) or curie_or_iri + return curie_or_iri + + def standardize(self) -> None: + """Standardize this MSDF.""" + for column, values in _get_sssom_schema_object().dict["slots"].items(): + if values["range"] != "EntityReference": + continue + if column not in self.df.columns: + continue + self.df[column] = self.df[column].map(self._standardize_curie_or_iri) + def merge(self, *msdfs: "MappingSetDataFrame", inplace: bool = True) -> "MappingSetDataFrame": """Merge two MappingSetDataframes. @@ -1126,22 +1143,8 @@ def reconcile_prefix_and_data( converter = msdf.converter converter = curies.remap_curie_prefixes(converter, prefix_reconciliation["prefix_synonyms"]) converter = curies.rewire(converter, prefix_reconciliation["prefix_expansion_reconciliation"]) - - # TODO make this standardization code directly part of msdf after - # switching to native converter - def _upgrade(curie_or_iri: str) -> str: - if not is_iri(curie_or_iri) and is_curie(curie_or_iri): - return converter.standardize_curie(curie_or_iri) or curie_or_iri - return curie_or_iri - - for column, values in _get_sssom_schema_object().dict["slots"].items(): - if values["range"] != "EntityReference": - continue - if column not in msdf.df.columns: - continue - msdf.df[column] = msdf.df[column].map(_upgrade) - msdf.converter = converter + msdf.standardize() return msdf