Skip to content

Commit d69b47b

Browse files
authored
Merge pull request #366 from biothings/update-schema_org_version
Ensure DDE and biothings_schema Use Same schema.org Version Issue #359
2 parents 1f26726 + 33e7154 commit d69b47b

18 files changed

+541
-130
lines changed

.gitignore

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,6 @@ venv.bak/
119119

120120
# Dev fixture
121121
biothings
122-
elasticsearch_dsl
123122
biothings_schema
124123

125124
# notebook tests

discovery/handlers/api/schema.py

Lines changed: 99 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,22 @@
11
"""
2-
Schema APIs
2+
Schema APIs
33
4-
Logical document structure in this module:
5-
{
6-
"url": <schema_url>,
7-
"namespace": <schema_name>,
8-
"source": { ... } // only in returned docs for now
9-
}
4+
Logical document structure in this module:
5+
{
6+
"url": <schema_url>,
7+
"namespace": <schema_name>,
8+
"source": { ... } // only in returned docs for now
9+
}
1010
11-
Add read-only protection to cores schemas.
12-
Add authentication and permission control.
13-
Add convenience features to assist frontend rendering.
11+
Add read-only protection to cores schemas.
12+
Add authentication and permission control.
13+
Add convenience features to assist frontend rendering.
1414
1515
"""
1616

1717
import json
18-
import re
1918
import logging
19+
import re
2020
from datetime import date, datetime
2121

2222
import certifi
@@ -26,8 +26,8 @@
2626
from discovery.model.schema import Schema
2727
from discovery.notify import SchemaNotifier
2828
from discovery.registry import schemas
29-
from discovery.utils.adapters import SchemaAdapter
3029
from discovery.registry.common import NoEntityError
30+
from discovery.utils.adapters import SchemaAdapter
3131

3232
from .base import APIBaseHandler, authenticated, registryOperation
3333

@@ -88,7 +88,9 @@ def trace_root(klass):
8888
while index < len(queue):
8989
for parent_line_string in klass.get("parent_classes", []):
9090
parents = parent_line_string.split(", ")
91-
ids = [(parent.split(":")[0], parent) for parent in parents if ":" in parent][::-1]
91+
ids = [
92+
(parent.split(":")[0], parent) for parent in parents if ":" in parent
93+
][::-1]
9294
for _id in ids:
9395
klass = schemas.get_class(_id[0], _id[1])
9496
if klass and klass not in queue:
@@ -128,7 +130,10 @@ class SchemaRegistryHandler(APIBaseHandler):
128130
"verbose": {"type": bool, "default": False, "alias": ["v"]},
129131
"start": {"type": int, "default": 0, "alias": ["from", "skip"]},
130132
"size": {"type": int, "default": 10, "max": 100, "alias": "skip"},
131-
"context": {"type": bool, "default": True}, # consider not default in future
133+
"context": {
134+
"type": bool,
135+
"default": True,
136+
}, # consider not default in future
132137
"source": {"type": bool, "default": True},
133138
},
134139
}
@@ -225,7 +230,7 @@ def get(self, namespace=None, curie=None):
225230
_fields = [x.strip() for x in self.args.field.split(",")]
226231
if not ("_meta" in _fields or "_meta.url" in _fields):
227232
_fields.append("_meta") # always include _meta.url in the response
228-
if not ("_status" in _fields):
233+
if "_status" not in _fields:
229234
_fields.append("_status")
230235
hits = [
231236
to_api_doc_repr(schema)
@@ -364,7 +369,7 @@ class SchemaViewHandler(APIBaseHandler):
364369
}, # indicates the special target namespace of the schema, e.g. schema.org or bioschemas.
365370
"validation_merge": {
366371
"type": bool,
367-
"default": False
372+
"default": False,
368373
}, # whether to merge validation schemas from parent classes
369374
}
370375
}
@@ -392,31 +397,39 @@ async def get(self):
392397
doc = None
393398
if self.args.url:
394399
# load doc from url
395-
response = await AsyncHTTPClient().fetch(self.args.url, ca_certs=certifi.where())
400+
response = await AsyncHTTPClient().fetch(
401+
self.args.url, ca_certs=certifi.where()
402+
)
396403
doc = response.body
397404
elif self.request.body:
398405
# load doc from request body
399406
doc = self.request.body
400407
if doc:
401408
doc = json.loads(doc)
402409
# Use the validation_merge parameter from query args, defaults to False
403-
validation_merge = getattr(self.args, 'validation_merge', False)
404-
validator_options = {"validation_merge": validation_merge, "raise_on_validation_error": False}
410+
validation_merge = getattr(self.args, "validation_merge", False)
411+
validator_options = {
412+
"validation_merge": validation_merge,
413+
"raise_on_validation_error": False,
414+
}
415+
schema_org_version = schemas.get_schema_org_version()
416+
_kwargs = {
417+
"validator_options": validator_options,
418+
"schema_org_version": schema_org_version,
419+
}
405420
if self.args.ns:
406421
if self.args.ns == "schema.org":
407422
# do not load any base schemas
408-
schema = SchemaAdapter(
409-
doc, base_schema=[], validator_options=validator_options
410-
)
423+
schema = SchemaAdapter(doc, base_schema=[], **_kwargs)
411424
# elif self.args.ns == "bioschemas":
412425
# # do not load bioschemas, only schema.org
413426
# schema = SchemaAdapter(
414-
# doc, base_schema=["schema.org"], validator_options=validator_options
427+
# doc, base_schema=["schema.org"], **_kwargs
415428
# )
416429
else:
417-
schema = SchemaAdapter(doc, validator_options=validator_options)
430+
schema = SchemaAdapter(doc, **_kwargs)
418431
else:
419-
schema = SchemaAdapter(doc, validator_options=validator_options)
432+
schema = SchemaAdapter(doc, **_kwargs)
420433
else:
421434
self.finish({})
422435
return
@@ -459,7 +472,7 @@ class SchemaHandler(APIBaseHandler):
459472
"default": "json",
460473
"enum": ("json", "yaml", "html", "msgpack"),
461474
}
462-
}
475+
},
463476
}
464477

465478
def class_property_filter(self, metadata, class_id):
@@ -482,7 +495,9 @@ def class_property_filter(self, metadata, class_id):
482495
property_list.append(data_dict)
483496
break
484497
elif "schema:domainIncludes" not in data_dict:
485-
raise HTTPError(400, reason="No key 'schema:domainIncludes' found.")
498+
raise HTTPError(
499+
400, reason="No key 'schema:domainIncludes' found."
500+
)
486501
else:
487502
# odd case -- error exception case
488503
raise HTTPError(
@@ -493,7 +508,9 @@ def class_property_filter(self, metadata, class_id):
493508

494509
def get_context_matches(self, metadata, context_dict):
495510
matches = []
496-
pattern = re.compile(r"^([a-zA-Z0-9_-]+):([a-zA-Z0-9_-]+)$") # Regex to match STRINGA:STRINGB
511+
pattern = re.compile(
512+
r"^([a-zA-Z0-9_-]+):([a-zA-Z0-9_-]+)$"
513+
) # Regex to match STRINGA:STRINGB
497514

498515
def recursive_search(data):
499516
if isinstance(data, dict):
@@ -509,6 +526,7 @@ def recursive_search(data):
509526
prefix = match.group(1)
510527
if prefix in context_dict:
511528
matches.append(prefix)
529+
512530
recursive_search(metadata)
513531
return set(matches)
514532

@@ -541,7 +559,7 @@ def build_schema_org_context_dict(self, metadata):
541559
"vann": "http://purl.org/vocab/vann/",
542560
"void": "http://rdfs.org/ns/void#",
543561
"xsd": "http://www.w3.org/2001/XMLSchema#",
544-
"cvisb": "https://data.cvisb.org/schema"
562+
"cvisb": "https://data.cvisb.org/schema",
545563
}
546564

547565
matches = self.get_context_matches(metadata, context_dict)
@@ -550,17 +568,17 @@ def build_schema_org_context_dict(self, metadata):
550568

551569
def add_schema_org_property_to_list(self, data_dict, property_list):
552570
temp_dict = {
553-
"@id": data_dict['curie'],
571+
"@id": data_dict["curie"],
554572
"@type": "rdf:Property",
555-
"rdfs:comment": data_dict['description'],
556-
"rdfs:label": data_dict['label'],
557-
"schema:domainIncludes": [{"@id": value} for value in data_dict['domain']],
558-
"schema:rangeIncludes": [{"@id": value} for value in data_dict['range']],
573+
"rdfs:comment": data_dict["description"],
574+
"rdfs:label": data_dict["label"],
575+
"schema:domainIncludes": [{"@id": value} for value in data_dict["domain"]],
576+
"schema:rangeIncludes": [{"@id": value} for value in data_dict["range"]],
559577
}
560578
property_list.append(temp_dict)
561579

562580
def filter_schema_org_class_with_properties(self, metadata, property_list):
563-
class_dict={
581+
class_dict = {
564582
"@id": metadata["_id"].replace("schema::", "", 1),
565583
"@type": "rdfs:Class",
566584
"rdfs:comment": metadata["description"],
@@ -569,7 +587,7 @@ def filter_schema_org_class_with_properties(self, metadata, property_list):
569587
}
570588

571589
property_list.append(class_dict)
572-
for data_dict in metadata['properties']:
590+
for data_dict in metadata["properties"]:
573591
self.add_schema_org_property_to_list(data_dict, property_list)
574592
return property_list
575593

@@ -609,10 +627,16 @@ def graph_data_filter(self, metadata, curie, property_list):
609627
return property_list
610628

611629
def raise_404_not_found_error(self, curie):
612-
raise HTTPError(404, reason=f"The requested namespace or class, {curie}, does not exist in registry.")
630+
raise HTTPError(
631+
404,
632+
reason=f"The requested namespace or class, {curie}, does not exist in registry.",
633+
)
613634

614635
def raise_404_no_validation_error(self, curie):
615-
raise HTTPError(404, reason=f"The validation schema is not provided for this class or property: {curie}")
636+
raise HTTPError(
637+
404,
638+
reason=f"The validation schema is not provided for this class or property: {curie}",
639+
)
616640

617641
def get_curie(self, metadata, curie, ns):
618642
"""
@@ -635,23 +659,37 @@ def get_curie(self, metadata, curie, ns):
635659
if ns == "schema":
636660
try:
637661
klass = schemas.get_class("schema", curie_str)
638-
property_list = self.filter_schema_org_class_with_properties(klass, property_list)
662+
property_list = self.filter_schema_org_class_with_properties(
663+
klass, property_list
664+
)
639665
except NoEntityError as no_class_error:
640666
try:
641-
logger.info(f"Error retrieving schema class: {no_class_error}, attempting to retrieve property instead...")
667+
logger.info(
668+
f"Error retrieving schema class: {no_class_error}, attempting to retrieve property instead..."
669+
)
642670
property_label = curie_str.split(":")[1]
643-
klass=schemas.get_schema_org_property(property_label)
644-
property_list = self.filter_schema_org_property(klass, property_list)
671+
klass = schemas.get_schema_org_property(property_label)
672+
property_list = self.filter_schema_org_property(
673+
klass, property_list
674+
)
645675
except NoEntityError as no_property_error:
646-
logger.info(f"Error retrieving schema class: {no_property_error}, attempting to retrieve property instead...")
676+
logger.info(
677+
f"Error retrieving schema class: {no_property_error}, attempting to retrieve property instead..."
678+
)
647679
self.raise_404_not_found_error(curie)
648680
# set the context property for schema.org
649-
metadata["@context"] = self.build_schema_org_context_dict(property_list)
681+
metadata["@context"] = self.build_schema_org_context_dict(
682+
property_list
683+
)
650684
else:
651-
property_list = self.graph_data_filter(metadata, curie_str, property_list)
685+
property_list = self.graph_data_filter(
686+
metadata, curie_str, property_list
687+
)
652688
elif isinstance(curie, list):
653689
for curie_str in curie:
654-
property_list = self.graph_data_filter(metadata, curie_str, property_list)
690+
property_list = self.graph_data_filter(
691+
metadata, curie_str, property_list
692+
)
655693
else:
656694
raise HTTPError(400, reason="Unidentified curie input request")
657695

@@ -772,12 +810,18 @@ def get(self, curie=None, validation=None):
772810
# if no curie is given, throw error
773811
if curie is None:
774812
raise HTTPError(
775-
400, reason="A curie with a namespace prefix is required, i.e 'n3c:Dataset'"
813+
400,
814+
reason="A curie with a namespace prefix is required, i.e 'n3c:Dataset'",
776815
)
777816

778817
# curie: /{ns}
779818
if ":" not in curie and validation:
780-
raise(HTTPError(400, reason="A validation request must be for a class or property, not a namespace."))
819+
raise (
820+
HTTPError(
821+
400,
822+
reason="A validation request must be for a class or property, not a namespace.",
823+
)
824+
)
781825

782826
elif ":" not in curie:
783827
self.handle_namespace_request(curie)
@@ -790,7 +834,9 @@ def get(self, curie=None, validation=None):
790834
# check if request has too many ns fields
791835
ns_list = list(set([x.split(":")[0] for x in curie.split(",")]))
792836
if len(ns_list) > 1:
793-
raise HTTPError(400, reason="Too many schemas(namespaces) requested")
837+
raise HTTPError(
838+
400, reason="Too many schemas(namespaces) requested"
839+
)
794840
else:
795841
ns = curie.split(":")[0]
796842

@@ -810,6 +856,7 @@ def get(self, curie=None, validation=None):
810856
else:
811857
self.handle_class_request(curie, schema_metadata)
812858

859+
813860
class CoverageHandler(APIBaseHandler):
814861
"""
815862
Fetch - GET ./api/coverage
@@ -836,5 +883,7 @@ def get(self, curie=None):
836883
except (ValueError, KeyError) as error:
837884
raise HTTPError(400, reason=f"No coverage found because: {error}")
838885
except Exception as error:
839-
raise HTTPError(400, reason=f"Error retrieving coverage with exception {error}")
886+
raise HTTPError(
887+
400, reason=f"Error retrieving coverage with exception {error}"
888+
)
840889
self.finish(coverage)

discovery/model/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
"""
1313
import os
1414

15-
from elasticsearch_dsl import connections
15+
from elasticsearch.dsl import connections
1616

1717
from .dataset import Dataset
1818
from .schema import Schema, SchemaClass

discovery/model/common.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from elasticsearch_dsl import Document, InnerDoc, Keyword, MetaField, Object
1+
from elasticsearch.dsl import Document, InnerDoc, Keyword, MetaField, Object
22

33

44
class DiscoveryDoc(Document):

discovery/model/dataset.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
"""
88
import hashlib
99

10-
from elasticsearch_dsl import Boolean, Date, InnerDoc, Keyword, Object, Text, normalizer
10+
from elasticsearch.dsl import Boolean, Date, InnerDoc, Keyword, Object, Text, normalizer
1111

1212
from .common import DiscoveryMeta, DiscoveryUserDoc
1313

discovery/model/schema.py

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9,17 +9,9 @@
99
import functools
1010
from datetime import datetime
1111

12-
from elasticsearch_dsl import (
13-
Boolean,
14-
Date,
15-
Index as ESIndex,
16-
InnerDoc,
17-
Integer,
18-
Keyword,
19-
Object,
20-
Text,
21-
)
22-
from elasticsearch_dsl.exceptions import ValidationException
12+
from elasticsearch.dsl import Boolean, Date, InnerDoc, Integer, Keyword, Object, Text
13+
from elasticsearch.dsl import Index as ESIndex
14+
from elasticsearch.dsl.exceptions import ValidationException
2315

2416
from .common import DiscoveryDoc, DiscoveryMeta, DiscoveryUserDoc
2517

discovery/pipeline.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99

1010
from biothings.web.query import ESQueryBuilder
11-
from elasticsearch_dsl import Search
11+
from elasticsearch.dsl import Search
1212

1313

1414
class DiscoveryQueryBuilder(ESQueryBuilder):

0 commit comments

Comments
 (0)