Skip to content

Commit 29d8972

Browse files
authored
Get dict of schema (#219)
* corrected sssom.get_dict_from_mapping * tests fixed * reformatted * test values changed * avoided .astype(str) for DataFrames * clean-up * formatted * duplicate drop * reverted concat back to merge * made NaN assignment dynamic * fixed DataFrame cell value deduction if None * formatted
1 parent f179b3f commit 29d8972

File tree

6 files changed

+82
-19
lines changed

6 files changed

+82
-19
lines changed

setup.cfg

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,7 @@ ignore =
144144
E501 # Line length
145145
W503 # Line break before binary operator (flake8 is wrong)
146146
S408 # don't worry about unsafe xml
147+
S324 # Use of weak MD4, MD5, or SHA1 hash for security. Consider usedforsecurity=False
147148
S318 # don't worry about unsafe xml
148149
S310 # TODO remove this later and switch to using requests
149150
B018 # This is 'useless' statements which are new atm.

sssom/parsers.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
from linkml_runtime.loaders.json_loader import JSONLoader
1919
from rdflib import Graph, URIRef
2020

21+
from sssom.constants import MAPPING_SET_SLOTS, MAPPING_SLOTS
22+
2123
from .context import (
2224
DEFAULT_LICENSE,
2325
DEFAULT_MAPPING_SET_ID,
@@ -75,7 +77,6 @@ def read_sssom_table(
7577
meta = sssom_metadata
7678

7779
prefix_map, meta = _get_prefix_map_and_metadata(prefix_map=prefix_map, meta=meta)
78-
7980
msdf = from_sssom_dataframe(df, prefix_map=prefix_map, meta=meta)
8081
return msdf
8182

@@ -188,21 +189,23 @@ def _get_mdict_ms_and_bad_attrs(
188189
) -> Tuple[dict, MappingSet, Counter]:
189190

190191
mdict = {}
192+
191193
for k, v in row.items():
192194
if v and v == v:
193195
ok = False
194196
if k:
195197
k = str(k)
196198
v = _address_multivalued_slot(k, v)
197-
if hasattr(Mapping, k):
199+
# if hasattr(Mapping, k):
200+
if k in MAPPING_SLOTS:
198201
mdict[k] = v
199202
ok = True
200-
if hasattr(MappingSet, k):
203+
# if hasattr(MappingSet, k):
204+
if k in MAPPING_SET_SLOTS:
201205
ms[k] = v
202206
ok = True
203207
if not ok:
204208
bad_attrs[k] += 1
205-
206209
return (mdict, ms, bad_attrs)
207210

208211

@@ -248,7 +251,6 @@ def from_sssom_dataframe(
248251
for _, row in df.iterrows():
249252
mdict, ms, bad_attrs = _get_mdict_ms_and_bad_attrs(row, ms, bad_attrs)
250253
mlist.append(_prepare_mapping(Mapping(**mdict)))
251-
252254
for k, v in bad_attrs.most_common():
253255
logging.warning(f"No attr for {k} [{v} instances]")
254256
# the autogenerated code's type annotations are _really_ messy. This is in fact okay,

sssom/util.py

Lines changed: 42 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535
from .context import SSSOM_URI_PREFIX, get_default_metadata, get_jsonld_context
3636
from .internal_context import multivalued_slots
3737
from .sssom_datamodel import Mapping as SSSOM_Mapping
38-
from .sssom_datamodel import MatchTypeEnum, PredicateModifierEnum, slots
38+
from .sssom_datamodel import slots
3939
from .sssom_document import MappingSetDocument
4040
from .typehints import Metadata, MetadataType, PrefixMap
4141

@@ -831,17 +831,49 @@ def get_dict_from_mapping(map_obj: Union[Any, Dict[Any, Any], SSSOM_Mapping]) ->
831831
:return: Dictionary
832832
"""
833833
map_dict = {}
834+
slots_with_double_as_range = [
835+
s
836+
for s in SCHEMA_DICT["slots"].keys()
837+
if SCHEMA_DICT["slots"][s]["range"] == "double"
838+
]
834839
for property in map_obj:
835-
if isinstance(map_obj[property], list):
836-
map_dict[property] = "|".join(
837-
enum_value.code.text
838-
for enum_value in map_obj[property]
839-
if type(enum_value).__name__ == MatchTypeEnum._defn.name
840-
)
841-
elif type(map_obj[property]).__name__ == PredicateModifierEnum._defn.name:
842-
map_dict[property] = map_obj[property].code.text
840+
if map_obj[property] is not None:
841+
if isinstance(map_obj[property], list):
842+
# IF object is an enum
843+
if (
844+
SCHEMA_DICT["slots"][property]["range"]
845+
in SCHEMA_DICT["enums"].keys()
846+
):
847+
# IF object is a multivalued enum
848+
if SCHEMA_DICT["slots"][property]["multivalued"]:
849+
map_dict[property] = "|".join(
850+
enum_value.code.text for enum_value in map_obj[property]
851+
)
852+
# If object is NOT multivalued BUT an enum.
853+
else:
854+
map_dict[property] = map_obj[property].code.text
855+
# IF object is NOT an enum but a list
856+
else:
857+
map_dict[property] = "|".join(
858+
enum_value for enum_value in map_obj[property]
859+
)
860+
# IF object NOT a list
861+
else:
862+
# IF object is an enum
863+
if (
864+
SCHEMA_DICT["slots"][property]["range"]
865+
in SCHEMA_DICT["enums"].keys()
866+
):
867+
map_dict[property] = map_obj[property].code.text
868+
else:
869+
map_dict[property] = map_obj[property]
843870
else:
844-
map_dict[property] = map_obj[property]
871+
# IF map_obj[property] is None:
872+
if property in slots_with_double_as_range:
873+
map_dict[property] = np.nan
874+
else:
875+
map_dict[property] = ""
876+
845877
return map_dict
846878

847879

tests/test_config.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ tests:
22
- filename: "basic.tsv"
33
inputformat: "tsv"
44
multiple_input: True
5-
ct_json_elements: 11
5+
ct_json_elements: 13
66
ct_data_frame_rows: 141
77
ct_graph_queries_owl:
88
query_count_equivalent_classes: 90
@@ -11,7 +11,7 @@ tests:
1111
- filename: "basic2.tsv"
1212
inputformat: "tsv"
1313
multiple_input: True
14-
ct_json_elements: 11
14+
ct_json_elements: 13
1515
ct_data_frame_rows: 126
1616
ct_graph_queries_owl:
1717
query_count_equivalent_classes: 79

tests/test_parsers.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
"""Tests for parsers."""
22

33
import json
4+
import math
45
import os
56
import unittest
67
from xml.dom import minidom
78

9+
import numpy as np
810
import pandas as pd
911
import yaml
1012
from rdflib import Graph
@@ -176,9 +178,36 @@ def test_piped_element_to_list(self):
176178
input_path = os.path.join(test_data_dir, "basic.tsv")
177179
msdf = read_sssom_table(input_path)
178180
df = msdf.df
179-
msdf.df = df[df["match_type"].str.contains("\\|")].reset_index()
181+
msdf.df = df[df["match_type"].str.contains("\\|", na=False)].reset_index()
180182
old_match_type = msdf.df["match_type"]
181183
msdoc = to_mapping_set_document(msdf)
182184
new_msdf = to_mapping_set_dataframe(msdoc)
183185
new_match_type = new_msdf.df["match_type"]
184186
self.assertTrue(old_match_type.equals(new_match_type))
187+
188+
def test_read_sssom_table(self):
    """Check that ``read_sssom_table`` imports every column of ``basic3.tsv``.

    The same file is loaded twice: once through ``read_sssom_table`` and once
    directly with ``pandas.read_csv``. Both frames must expose the same set
    of columns and, cell by cell, the same values. A cell that is an empty
    string or NaN in the parsed frame is accepted only where the raw frame
    holds a missing value.
    """
    input_path = os.path.join(test_data_dir, "basic3.tsv")
    msdf = read_sssom_table(input_path)
    imported_df = pd.read_csv(input_path, comment="#", sep="\t")
    self.assertEqual(set(imported_df.columns), set(msdf.df.columns))

    for idx, row in msdf.df.iterrows():
        # ``idx`` is used positionally, so this relies on both frames
        # keeping the default 0..n-1 index (as the original test did).
        raw_row = imported_df.iloc[idx]
        # ``Series.items`` is the supported spelling; ``Series.iteritems``
        # was deprecated and then removed in pandas 2.0.
        for column, value in row.items():
            raw_value = raw_row[column]
            # NaN must be detected by value, not identity: floats read back
            # from a DataFrame are not guaranteed to be the ``np.nan`` object.
            value_is_nan = isinstance(value, float) and math.isnan(value)
            if value_is_nan or value == "":
                self.assertTrue(
                    pd.isna(raw_value),
                    msg=f"row {idx}, column {column!r}: expected a missing "
                    f"value in the raw file, found {raw_value!r}",
                )
            else:
                self.assertEqual(
                    raw_value,
                    value,
                    msg=f"row {idx}, column {column!r} differs",
                )

tests/test_reconcile.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@ def test_merge(self):
2929
"""Test merging two tables."""
3030
msdf1 = read_sssom_table(data_dir / "basic.tsv")
3131
msdf2 = read_sssom_table(data_dir / "basic2.tsv")
32-
3332
merged_msdf = merge_msdf(msdf1, msdf2)
3433

3534
self.assertEqual(71, len(merged_msdf.df))

0 commit comments

Comments
 (0)