Skip to content

Commit ac2aa7f

Browse files
authored
Merge branch 'master' into create-pull-request/patch
2 parents 1ad00fe + 45e7fd9 commit ac2aa7f

File tree

10 files changed

+865
-895
lines changed

10 files changed

+865
-895
lines changed

poetry.lock

Lines changed: 789 additions & 869 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ readme = "README.md"
1313
[tool.poetry.dependencies]
1414
python = ">=3.9,<4.0.0"
1515
click = ">=8.1.6"
16-
curies = ">=0.7.3"
16+
curies = ">=0.10.18"
1717
linkml-runtime = "^1.7.5"
1818
linkml = ">1.7.10"
1919
pandas = ">1.0.3"

src/sssom/io.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import yaml
1414
from curies import Converter
1515
from deprecation import deprecated
16+
from linkml.validator import ValidationReport
1617

1718
from sssom.validators import validate
1819

@@ -98,17 +99,23 @@ def parse_file(
9899
write_table(doc, output, embedded_mode)
99100

100101

101-
def validate_file(input_path: str, validation_types: List[SchemaValidationType]) -> None:
102+
def validate_file(
103+
input_path: str,
104+
validation_types: Optional[List[SchemaValidationType]] = None,
105+
fail_on_error: bool = True,
106+
) -> dict[SchemaValidationType, ValidationReport]:
102107
"""Validate the incoming SSSOM TSV according to the SSSOM specification.
103108
104109
:param input_path: The path to the input file in one of the legal formats, e.g. obographs, alignmentapi-xml
105110
:param validation_types: A list of validation types to run.
111+
:param fail_on_error: Should an exception be raised on error of _any_ validator?
112+
:returns: A dictionary from validation types to validation reports
106113
"""
107114
# Two things to check:
108115
# 1. All prefixes in the DataFrame are defined in prefix_map
109116
# 2. All columns in the DataFrame abide by sssom-schema.
110117
msdf = parse_sssom_table(file_path=input_path)
111-
validate(msdf=msdf, validation_types=validation_types)
118+
return validate(msdf=msdf, validation_types=validation_types, fail_on_error=fail_on_error)
112119

113120

114121
def split_file(input_path: str, output_directory: Union[str, Path]) -> None:

src/sssom/parsers.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""SSSOM parsers."""
22

3+
import gzip
34
import io
45
import itertools as itt
56
import json
@@ -96,6 +97,10 @@ def _open_input(input: Union[str, Path, TextIO]) -> io.StringIO:
9697
elif "\n" in input or "\r" in input:
9798
# It's string data
9899
return io.StringIO(input)
100+
elif input.endswith(".gz"):
101+
with gzip.open(input, "rt") as file:
102+
file_content = file.read()
103+
return io.StringIO(file_content)
99104
else:
100105
# It's a local file path
101106
with open(input, "r") as file:

src/sssom/rdf_util.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import logging
44
from typing import Any, Dict, List, Optional
55

6+
import curies
67
from linkml_runtime.utils.metamodelcore import URIorCURIE
78
from rdflib import Graph, URIRef
89
from sssom_schema import EntityReference, Mapping
@@ -26,7 +27,7 @@ def rewire_graph(
2627
if mdoc.mapping_set.mappings is None:
2728
raise TypeError
2829

29-
converter = mdoc.converter
30+
converter: curies.Converter = mdoc.converter
3031
rewire_map: Dict[URIorCURIE, URIorCURIE] = {}
3132
for m in mdoc.mapping_set.mappings:
3233
if not isinstance(m, Mapping):
@@ -42,8 +43,8 @@ def rewire_graph(
4243
curr_tgt = rewire_map[src]
4344
logging.info(f"Ambiguous: {src} -> {tgt} vs {curr_tgt}")
4445
if precedence:
45-
curr_pfx, _ = converter.parse_curie(curr_tgt)
46-
tgt_pfx, _ = converter.parse_curie(tgt)
46+
curr_pfx = converter.parse_curie(curr_tgt, strict=True).prefix
47+
tgt_pfx = converter.parse_curie(tgt, strict=True).prefix
4748
if tgt_pfx in precedence:
4849
if curr_pfx not in precedence or precedence.index(
4950
tgt_pfx

src/sssom/util.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
import pandas as pd
1818
import validators
1919
import yaml
20-
from curies import Converter
20+
from curies import Converter, ReferenceTuple
2121
from jsonschema import ValidationError
2222
from linkml_runtime.linkml_model.types import Uriorcurie
2323
from sssom_schema import Mapping as SSSOM_Mapping
@@ -257,7 +257,7 @@ def clean_prefix_map(self, strict: bool = True) -> None:
257257
listed in the 'curie_map'.
258258
:raises ValueError: If prefixes absent in 'curie_map' and strict flag = True
259259
"""
260-
prefixes_in_table = get_prefixes_used_in_table(self.df, converter=self.converter)
260+
prefixes_in_table = get_prefixes_used_in_table(self.df)
261261
if self.metadata:
262262
prefixes_in_table.update(get_prefixes_used_in_metadata(self.metadata))
263263

@@ -1118,20 +1118,20 @@ def _is_iri(string: str) -> bool:
11181118
def get_prefix_from_curie(curie: str) -> str:
11191119
"""Get the prefix from a CURIE."""
11201120
if _is_curie(curie):
1121-
return curie.split(":")[0]
1121+
return ReferenceTuple.from_curie(curie).prefix
11221122
else:
11231123
return ""
11241124

11251125

1126-
def get_prefixes_used_in_table(df: pd.DataFrame, converter: Converter) -> Set[str]:
1126+
def get_prefixes_used_in_table(df: pd.DataFrame) -> Set[str]:
11271127
"""Get a list of prefixes used in CURIEs in key feature columns in a dataframe."""
11281128
prefixes = set(SSSOM_BUILT_IN_PREFIXES)
11291129
if df.empty:
11301130
return prefixes
11311131
sssom_schema_object = _get_sssom_schema_object()
11321132
entity_reference_slots = sssom_schema_object.entity_reference_slots & set(df.columns)
11331133
new_prefixes = {
1134-
converter.parse_curie(row).prefix
1134+
ReferenceTuple.from_curie(row).prefix
11351135
for col in entity_reference_slots
11361136
for row in df[col]
11371137
if not _is_iri(row) and _is_curie(row)

src/sssom/validators.py

Lines changed: 26 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""Validators."""
22

33
import logging
4-
from typing import Callable, List, Mapping
4+
from typing import Callable, List, Mapping, Optional
55

66
from jsonschema import ValidationError
77
from linkml.validator import ValidationReport, Validator
@@ -12,22 +12,29 @@
1212
from sssom.parsers import to_mapping_set_document
1313
from sssom.util import MappingSetDataFrame, get_all_prefixes
1414

15-
from .constants import SCHEMA_YAML, SchemaValidationType, _get_sssom_schema_object
15+
from .constants import (
16+
DEFAULT_VALIDATION_TYPES,
17+
SCHEMA_YAML,
18+
SchemaValidationType,
19+
_get_sssom_schema_object,
20+
)
1621

1722

1823
def validate(
1924
msdf: MappingSetDataFrame,
20-
validation_types: List[SchemaValidationType],
25+
validation_types: Optional[List[SchemaValidationType]] = None,
2126
fail_on_error: bool = True,
22-
) -> None:
27+
) -> dict[SchemaValidationType, ValidationReport]:
2328
"""Validate SSSOM files against `sssom-schema` using linkML's validator function.
2429
2530
:param msdf: MappingSetDataFrame.
2631
:param validation_types: SchemaValidationType
2732
:param fail_on_error: If true, throw an error when execution of a method has failed
33+
:returns: A dictionary from validation types to validation reports
2834
"""
29-
for vt in validation_types:
30-
VALIDATION_METHODS[vt](msdf, fail_on_error)
35+
if validation_types is None:
36+
validation_types = DEFAULT_VALIDATION_TYPES
37+
return {vt: VALIDATION_METHODS[vt](msdf, fail_on_error) for vt in validation_types}
3138

3239

3340
def print_linkml_report(report: ValidationReport, fail_on_error: bool = True):
@@ -88,7 +95,7 @@ def _clean_dict(d):
8895
return cleaned_dict
8996

9097

91-
def validate_json_schema(msdf: MappingSetDataFrame, fail_on_error: bool = True) -> None:
98+
def validate_json_schema(msdf: MappingSetDataFrame, fail_on_error: bool = True) -> ValidationReport:
9299
"""Validate JSON Schema using linkml's JsonSchemaDataValidator.
93100
94101
:param msdf: MappingSetDataFrame to be validated.
@@ -106,9 +113,10 @@ def validate_json_schema(msdf: MappingSetDataFrame, fail_on_error: bool = True)
106113

107114
report = validator.validate(mapping_set_dict, "mapping set")
108115
print_linkml_report(report, fail_on_error)
116+
return report
109117

110118

111-
def validate_shacl(msdf: MappingSetDataFrame, fail_on_error: bool = True) -> None:
119+
def validate_shacl(msdf: MappingSetDataFrame, fail_on_error: bool = True) -> ValidationReport:
112120
"""Validate SHACL file.
113121
114122
:param msdf: TODO: https://github.com/linkml/linkml/issues/850 .
@@ -118,7 +126,7 @@ def validate_shacl(msdf: MappingSetDataFrame, fail_on_error: bool = True) -> Non
118126
raise NotImplementedError
119127

120128

121-
def validate_sparql(msdf: MappingSetDataFrame, fail_on_error: bool = True) -> None:
129+
def validate_sparql(msdf: MappingSetDataFrame, fail_on_error: bool = True) -> ValidationReport:
122130
"""Validate SPARQL file.
123131
124132
:param msdf: MappingSetDataFrame
@@ -132,7 +140,9 @@ def validate_sparql(msdf: MappingSetDataFrame, fail_on_error: bool = True) -> No
132140
raise NotImplementedError
133141

134142

135-
def check_all_prefixes_in_curie_map(msdf: MappingSetDataFrame, fail_on_error: bool = True) -> None:
143+
def check_all_prefixes_in_curie_map(
144+
msdf: MappingSetDataFrame, fail_on_error: bool = True
145+
) -> ValidationReport:
136146
"""Check all `EntityReference` slots are mentioned in 'curie_map'.
137147
138148
:param msdf: MappingSetDataFrame
@@ -154,9 +164,12 @@ def check_all_prefixes_in_curie_map(msdf: MappingSetDataFrame, fail_on_error: bo
154164
)
155165
report = ValidationReport(results=validation_results)
156166
print_linkml_report(report, fail_on_error)
167+
return report
157168

158169

159-
def check_strict_curie_format(msdf: MappingSetDataFrame, fail_on_error: bool = True) -> None:
170+
def check_strict_curie_format(
171+
msdf: MappingSetDataFrame, fail_on_error: bool = True
172+
) -> ValidationReport:
160173
"""Check all `EntityReference` slots are formatted as unambiguous curies.
161174
162175
Implemented rules:
@@ -194,9 +207,10 @@ def check_strict_curie_format(msdf: MappingSetDataFrame, fail_on_error: bool = T
194207

195208
report = ValidationReport(results=validation_results)
196209
print_linkml_report(report, fail_on_error)
210+
return report
197211

198212

199-
VALIDATION_METHODS: Mapping[SchemaValidationType, Callable] = {
213+
VALIDATION_METHODS: Mapping[SchemaValidationType, Callable[..., ValidationReport]] = {
200214
SchemaValidationType.JsonSchema: validate_json_schema,
201215
SchemaValidationType.Shacl: validate_shacl,
202216
SchemaValidationType.Sparql: validate_sparql,

tests/test_utils.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,22 @@ def test_clean_prefix_map_strict(self):
146146
def test_clean_prefix_map_not_strict(self):
147147
"""Test clean prefix map with 'strict'=False."""
148148
msdf = parse_sssom_table(f"{data_dir}/test_clean_prefix.tsv")
149+
self.assertEqual(
150+
{
151+
"a": "http://example.org/a/",
152+
"b": "http://example.org/b/",
153+
"c": "http://example.org/c/",
154+
"d": "http://example.org/d/",
155+
"orcid": "https://orcid.org/",
156+
"owl": "http://www.w3.org/2002/07/owl#",
157+
"rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
158+
"rdfs": "http://www.w3.org/2000/01/rdf-schema#",
159+
"semapv": "https://w3id.org/semapv/vocab/",
160+
"skos": "http://www.w3.org/2004/02/skos/core#",
161+
"sssom": "https://w3id.org/sssom/",
162+
},
163+
msdf.prefix_map,
164+
)
149165
original_curie_map = msdf.prefix_map
150166
self.assertEqual(
151167
{"a", "b", "c", "d", "orcid"}.union(SSSOM_BUILT_IN_PREFIXES),
@@ -202,7 +218,7 @@ def test_get_prefixes(self):
202218
metadata_path = data_dir.joinpath("enm_example.yml")
203219
metadata = yaml.safe_load(metadata_path.read_text())
204220
msdf = parse_sssom_table(path, meta=metadata)
205-
prefixes = get_prefixes_used_in_table(msdf.df, converter=msdf.converter)
221+
prefixes = get_prefixes_used_in_table(msdf.df)
206222
self.assertNotIn("http", prefixes)
207223
self.assertNotIn("https", prefixes)
208224
self.assertEqual(

tests/test_validate.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,11 @@ def test_validate_json(self):
2828
Validate that the incoming file (basic.tsv) abides
2929
by the rules set by `sssom-schema`.
3030
"""
31-
self.assertIsNone(validate(self.correct_msdf1, self.validation_types))
31+
rv = validate(self.correct_msdf1, self.validation_types)
32+
self.assertIsNotNone(rv)
33+
self.assertIn(SchemaValidationType.JsonSchema, rv)
34+
json_validation = rv[SchemaValidationType.JsonSchema]
35+
self.assertEqual([], json_validation.results)
3236

3337
@unittest.skip(
3438
reason="""\

tox.ini

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,10 @@ exclude =
9292

9393

9494
[testenv:mypy]
95-
deps = mypy
95+
deps =
96+
mypy
97+
types-PyYAML
98+
types-requests
9699
skip_install = true
97100
commands = mypy --install-types --non-interactive --ignore-missing-imports --implicit-optional src/sssom tests/
98101
description = Run the mypy tool to check static typing on the project.

0 commit comments

Comments
 (0)