Skip to content

Commit 3b6bd48

Browse files
authored
Merge pull request #192 from liip/feat/adapt-schema.org-rdf-profile-for-multilingual-metadata
Feat/adapt schema.org rdf profile for multilingual metadata
2 parents babb5b2 + 8bb676e commit 3b6bd48

File tree

7 files changed

+426
-2
lines changed

7 files changed

+426
-2
lines changed

ckanext/switzerland/dcat/profiles.py

Lines changed: 282 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,20 @@
33
import time
44
from datetime import datetime
55

6+
import ckan.plugins.toolkit as tk
67
import rdflib
78
from ckan.lib.helpers import url_for
89
from rdflib import BNode, Literal, URIRef
910
from rdflib.namespace import RDF, RDFS, SKOS, XSD, Namespace
1011

11-
from ckanext.dcat.profiles import RDFProfile
12+
from ckanext.dcat.profiles import CleanedURIRef, RDFProfile, SchemaOrgProfile
1213
from ckanext.dcat.utils import resource_uri
1314
from ckanext.switzerland.helpers import (
1415
get_langs,
16+
get_publisher_dict_from_dataset,
1517
map_to_valid_format,
1618
ogdch_get_default_terms_of_use,
19+
uri_to_iri,
1720
)
1821

1922
log = logging.getLogger(__name__)
@@ -568,3 +571,281 @@ def _add_distribution_to_graph(self, dataset_ref, resource_dict, dataset_dict):
568571
def graph_from_catalog(self, catalog_dict, catalog_ref):
    """Type ``catalog_ref`` as a ``DCAT.Catalog`` in this profile's graph."""
    # catalog_dict is not read here; only the rdf:type triple is emitted
    self.g.add((catalog_ref, RDF.type, DCAT.Catalog))
574+
575+
576+
class MultiLangProfile(RDFProfile):
    """RDF profile helpers for serializing multilingual field values.

    A multilingual value is a dict mapping a language code to either a single
    value or a list of values (e.g. keywords). Every non-empty value becomes a
    language-tagged ``Literal`` in ``self.g``.
    """

    def _add_multilang_value(
        self, subject, predicate, key=None, data_dict=None, multilang_values=None
    ):
        """Add one language-tagged literal per translation to the graph.

        :param subject: RDF node the literals are attached to
        :param predicate: RDF predicate used for every literal
        :param key: key looked up in ``data_dict`` when ``multilang_values``
            is not given
        :param data_dict: dict holding the multilingual value under ``key``
        :param multilang_values: the multilingual value itself; takes
            precedence over ``data_dict[key]``
        """
        if not multilang_values and data_dict and key:
            multilang_values = data_dict.get(key)
        if not multilang_values:
            return

        # Only the ``.items()`` lookup is guarded, so an AttributeError raised
        # while adding triples is no longer mistaken for "not a dict".
        try:
            translations = multilang_values.items()
        except AttributeError:
            # not a multilang dict: add it as a plain, untranslated Literal
            self.g.add((subject, predicate, Literal(multilang_values)))
            return

        # ``lang`` is deliberately distinct from the ``key`` parameter
        for lang, values in translations:
            if not values:
                continue
            # the values can be either a single translation or they are
            # nested in a list (e.g. keywords)
            if not isinstance(values, list):
                values = [values]
            for value in values:
                if value:
                    self.g.add((subject, predicate, Literal(value, lang=lang)))

    def _add_multilang_triples_from_dict(self, _dict, subject, items):
        """Add multilingual triples for every entry of ``items``.

        :param _dict: dict the values are read from
        :param subject: RDF node the literals are attached to
        :param items: iterable of ``(key, predicate, fallbacks, type)``
            tuples; the ``type`` element is ignored because multilingual
            values are always written as literals
        """
        for key, predicate, fallbacks, _type in items:
            self._add_multilang_triple_from_dict(
                _dict, subject, predicate, key, fallbacks=fallbacks
            )

    def _add_multilang_triple_from_dict(
        self, _dict, subject, predicate, key, fallbacks=None
    ):
        """
        Adds a new multilang triple to the graph with the provided parameters

        The subject and predicate of the triple are passed as the relevant
        RDFLib objects (URIRef or BNode). The object is always a literal value,
        which is extracted from the dict using the provided key (see
        `_get_dict_value`). If no value is found for the key, the keys in
        `fallbacks` are tried in order and the first non-empty value is used.
        """
        value = self._get_dict_value(_dict, key)

        # previously ``fallbacks`` was accepted but silently ignored
        if not value and fallbacks:
            for fallback in fallbacks:
                value = self._get_dict_value(_dict, fallback)
                if value:
                    break

        if value:
            self._add_multilang_value(subject, predicate, multilang_values=value)
622+
623+
624+
class SwissSchemaOrgProfile(SchemaOrgProfile, MultiLangProfile):
    """schema.org serialization of datasets with multilingual metadata.

    Translated fields (title, description, keywords, organization title) are
    written as language-tagged literals via ``MultiLangProfile``.

    NOTE(review): the ``_*_graph`` methods are presumably hooks called by
    ``SchemaOrgProfile.graph_from_dataset`` - confirm against ckanext-dcat.
    """

    def _basic_fields_graph(self, dataset_ref, dataset_dict):
        """Add the plain and the multilingual basic dataset fields."""
        items = [
            ("identifier", SCHEMA.identifier, None, Literal),
            ("version", SCHEMA.version, ["dcat_version"], Literal),
            ("issued", SCHEMA.datePublished, None, Literal),
            ("modified", SCHEMA.dateModified, None, Literal),
            ("author", SCHEMA.author, ["contact_name", "maintainer"], Literal),
            ("url", SCHEMA.sameAs, None, Literal),
        ]
        self._add_triples_from_dict(dataset_dict, dataset_ref, items)

        items = [
            ("title", SCHEMA.name, None, Literal),
            ("description", SCHEMA.description, None, Literal),
        ]
        self._add_multilang_triples_from_dict(dataset_dict, dataset_ref, items)

    def _publisher_graph(self, dataset_ref, dataset_dict):
        """Add the publisher organization and its contact point.

        The publisher URI and name come from the dataset's ``publisher``
        field; the name falls back to the (multilingual) organization title
        and the contact URL to the organization URL or ``ckan.site_url``.
        """
        organization = dataset_dict.get("organization")
        # ``publisher`` is part of the guard because it is the field the
        # publisher data is actually read from below.
        has_publisher = any(
            [
                self._get_dataset_value(dataset_dict, "publisher_uri"),
                self._get_dataset_value(dataset_dict, "publisher_name"),
                dataset_dict.get("publisher"),
                organization,
            ]
        )
        if not has_publisher:
            return

        publisher_uri, publisher_name = get_publisher_dict_from_dataset(
            dataset_dict.get("publisher")
        )
        if publisher_uri:
            publisher_details = CleanedURIRef(publisher_uri)
        else:
            publisher_details = BNode()

        self.g.add((publisher_details, RDF.type, SCHEMA.Organization))
        self.g.add((dataset_ref, SCHEMA.publisher, publisher_details))
        self.g.add((dataset_ref, SCHEMA.sourceOrganization, publisher_details))

        if not publisher_name and organization:
            # the organization title may be a multilang dict
            self._add_multilang_value(
                publisher_details,
                SCHEMA.name,
                multilang_values=organization.get("title"),
            )
        elif publisher_name:
            # guard: never emit ``Literal(None)`` as the publisher name
            self.g.add((publisher_details, SCHEMA.name, Literal(publisher_name)))

        contact_point = BNode()
        self.g.add((publisher_details, SCHEMA.contactPoint, contact_point))

        self.g.add((contact_point, SCHEMA.contactType, Literal("customer service")))

        publisher_url = self._get_dataset_value(dataset_dict, "publisher_url")
        if not publisher_url and organization:
            publisher_url = organization.get("url") or tk.config.get(
                "ckan.site_url", ""
            )

        # guard: never emit ``Literal(None)`` / an empty literal as the URL
        if publisher_url:
            self.g.add((contact_point, SCHEMA.url, Literal(publisher_url)))

        items = [
            (
                "publisher_email",
                SCHEMA.email,
                ["contact_email", "maintainer_email", "author_email"],
                Literal,
            ),
            (
                "publisher_name",
                SCHEMA.name,
                ["contact_name", "maintainer", "author"],
                Literal,
            ),
        ]

        self._add_triples_from_dict(dataset_dict, contact_point, items)

    def _temporal_graph(self, dataset_ref, dataset_dict):
        """Add ``SCHEMA.temporalCoverage`` from the first temporal entry."""
        # schema.org temporalCoverage only allows to specify one temporal
        # DCAT-AP Switzerland allows to specify multiple
        # for the mapping we always use the first one
        temporals = self._get_dataset_value(dataset_dict, "temporals")
        try:
            first = temporals[0]
            start = first.get("start_date")
            end = first.get("end_date")
        except (IndexError, KeyError, TypeError, AttributeError):
            # do not add temporals if there are none (or they are malformed)
            return

        if start and end:
            self.g.add(
                (
                    dataset_ref,
                    SCHEMA.temporalCoverage,
                    Literal(f"{start}/{end}"),
                )
            )
        elif start or end:
            # only one bound is known: use it on its own
            self._add_date_triple(dataset_ref, SCHEMA.temporalCoverage, start or end)

    def _tags_graph(self, dataset_ref, dataset_dict):
        """Add the multilingual keywords as ``SCHEMA.keywords`` literals."""
        # One call covers every language; the previous per-key loop only
        # re-added the same triples.
        items = [
            ("keywords", SCHEMA.keywords, None, Literal),
        ]
        self._add_multilang_triples_from_dict(dataset_dict, dataset_ref, items)

    def _distribution_basic_fields_graph(self, distribution, resource_dict):
        """Add the plain and the multilingual basic distribution fields."""
        items = [
            ("issued", SCHEMA.datePublished, None, Literal),
            ("modified", SCHEMA.dateModified, None, Literal),
        ]

        self._add_triples_from_dict(resource_dict, distribution, items)

        items = [
            ("title", SCHEMA.name, None, Literal),
            ("description", SCHEMA.description, None, Literal),
        ]
        self._add_multilang_triples_from_dict(resource_dict, distribution, items)

    def contact_details(self, dataset_dict, dataset_ref, g):
        """Add every complete contact point of the dataset to ``g``.

        Contact points lacking an email or a name are skipped.

        :returns: the graph ``g`` that was passed in
        """
        # Contact details used by graph_from_dataset
        if dataset_dict.get("contact_points"):
            contact_points = self._get_dataset_value(dataset_dict, "contact_points")
            for contact_point in contact_points:
                if not contact_point.get("email") or not contact_point.get("name"):
                    continue
                contact_details = BNode()

                contact_point_email = f"mailto:{contact_point['email']}"
                contact_point_name = contact_point["name"]

                g.add((contact_details, RDF.type, VCARD.Organization))
                g.add((contact_details, VCARD.hasEmail, URIRef(contact_point_email)))
                g.add((contact_details, VCARD.fn, Literal(contact_point_name)))

                g.add((dataset_ref, SCHEMA.contactPoint, contact_details))

        return g

    def download_access_url(self, resource_dict, distribution, g):
        """Add the download and access URLs of a resource to ``g``.

        Only URLs accepted by ``uri_to_iri`` are added. The access URL is
        taken from ``url`` when it differs from the download URL, otherwise
        the validated download URL is reused.

        :returns: the graph ``g`` that was passed in
        """
        # Download URL & Access URL used by graph_from_dataset
        download_url = resource_dict.get("download_url")
        download_iri = None
        if download_url:
            try:
                download_iri = uri_to_iri(download_url)
            except ValueError:
                # only add valid URL
                pass
            else:
                g.add((distribution, SCHEMA.downloadURL, URIRef(download_iri)))

        url = resource_dict.get("url")
        # compare against the converted value when the conversion succeeded
        if url and url != (download_iri or download_url):
            try:
                g.add((distribution, SCHEMA.accessURL, URIRef(uri_to_iri(url))))
            except ValueError:
                # only add valid URL
                pass
        elif download_iri:
            # reuse only a download URL that passed validation; an invalid
            # one must not leak into the graph as accessURL
            g.add((distribution, SCHEMA.accessURL, URIRef(download_iri)))

        return g

    def graph_from_dataset(self, dataset_dict, dataset_ref):
        """Add contact points and distributions, then run the parent profile."""
        g = self.g

        # Contact details
        self.contact_details(dataset_dict, dataset_ref, g)

        # Resources
        for resource_dict in dataset_dict.get("resources", []):
            distribution = URIRef(resource_uri(resource_dict))

            g.add((dataset_ref, SCHEMA.distribution, distribution))
            g.add((distribution, RDF.type, SCHEMA.Distribution))

            # Simple values
            items = [
                ("status", ADMS.status, None, Literal),
                ("coverage", DCT.coverage, None, Literal),
                ("identifier", DCT.identifier, None, Literal),
                ("spatial", DCT.spatial, None, Literal),
            ]

            self._add_triples_from_dict(resource_dict, distribution, items)

            self._add_multilang_value(
                distribution, DCT.title, "display_name", resource_dict
            )
            self._add_multilang_value(
                distribution, DCT.description, "description", resource_dict
            )

            # Language (``or []`` also covers an explicit ``None`` value)
            languages = resource_dict.get("language") or []
            for lang in languages:
                if "http://publications.europa.eu/resource/authority" in lang:
                    # Already a valid EU language URI
                    g.add((distribution, DCT.language, URIRef(lang)))
                else:
                    uri = LANGUAGE_URI_MAPPING.get(lang, None)
                    if uri:
                        g.add((distribution, DCT.language, URIRef(uri)))
                    else:
                        log.debug(f"Language '{lang}' not found in language_uri_map")

            # Download URL & Access URL
            self.download_access_url(resource_dict, distribution, g)

            # Dates
            items = [
                ("issued", DCT.issued, None, Literal),
                ("modified", DCT.modified, None, Literal),
            ]

            self._add_date_triples_from_dict(resource_dict, distribution, items)
            # ByteSize
            if resource_dict.get("byte_size"):
                g.add(
                    (distribution, SCHEMA.byteSize, Literal(resource_dict["byte_size"]))
                )

        super().graph_from_dataset(dataset_dict, dataset_ref)

    def parse_dataset(self, dataset_dict, dataset_ref):
        """Delegate parsing to the parent profile and hand back its result."""
        # the parent's return value used to be dropped, so callers got None
        return super().parse_dataset(dataset_dict, dataset_ref)

ckanext/switzerland/helpers.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,11 @@
55
import unicodedata
66
from collections import OrderedDict, defaultdict
77
from datetime import datetime
8+
from urllib.parse import urlparse
89
from zoneinfo import ZoneInfo
910

1011
import ckan.plugins.toolkit as tk
12+
import iribaker
1113
import requests
1214
from ckan.common import _
1315
from ckan.lib.helpers import _link_to
@@ -684,3 +686,31 @@ def ogdch_get_default_terms_of_use():
684686
"name": _("Terms of use opentransportdata.swiss"),
685687
"url": f"https://opentransportdata.swiss/{ _('en/terms-of-use') }",
686688
}
689+
690+
691+
def get_publisher_dict_from_dataset(publisher):
    """Return the ``(url, name)`` pair of a dataset's publisher field.

    :param publisher: the publisher as a dict or as a JSON-encoded string
    :returns: tuple ``(url, name)``; both are ``None`` when the publisher is
        empty, is not valid JSON, or does not decode to a JSON object
    """
    if not publisher:
        return None, None
    if not isinstance(publisher, dict):
        try:
            publisher = json.loads(publisher)
        except (TypeError, ValueError):
            # malformed publisher data must not break the caller, which
            # unpacks the result into two names
            return None, None
        if not isinstance(publisher, dict):
            return None, None
    return publisher.get("url"), publisher.get("name")
697+
698+
699+
def uri_to_iri(uri):
    """
    convert URI to IRI (used for RDF)
    this function also validates the URI and throws a ValueError if the
    provided URI is invalid

    :param uri: the URI to convert
    :returns: the IRI produced by ``iribaker.to_iri``
    :raises ValueError: if the URI is empty, is not a string, has no scheme
        or netloc, or cannot be converted
    """
    if not uri:
        raise ValueError("Provided URI is empty or None")

    try:
        result = urlparse(uri)
    except (AttributeError, TypeError) as e:
        # urlparse fails this way on input that is neither str nor bytes;
        # callers only catch ValueError
        raise ValueError(f"Provided URI is not a string: {e}") from e

    if not result.scheme or not result.netloc or result.netloc == "-":
        raise ValueError("Provided URI does not have a valid schema or netloc")

    try:
        return iribaker.to_iri(uri)
    except Exception as e:
        # normalize any conversion failure to ValueError, keeping the cause
        raise ValueError(f"Provided URI can't be converted to IRI: {e}") from e
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
{% ckan_extends %}
2+
3+
{# Override the inherited structured_data block: embed the output of h.structured_data for this package, built with the "swiss_schemaorg" profile, in a JSON-LD script tag. #}
{% block structured_data %}
4+
<!-- Structured data -->
5+
<script type="application/ld+json">
6+
{{ h.structured_data(pkg, ["swiss_schemaorg"]) | safe }}
7+
</script>
8+
{% endblock %}

0 commit comments

Comments
 (0)