|
3 | 3 | import time |
4 | 4 | from datetime import datetime |
5 | 5 |
|
| 6 | +import ckan.plugins.toolkit as tk |
6 | 7 | import rdflib |
7 | 8 | from ckan.lib.helpers import url_for |
8 | 9 | from rdflib import BNode, Literal, URIRef |
9 | 10 | from rdflib.namespace import RDF, RDFS, SKOS, XSD, Namespace |
10 | 11 |
|
11 | | -from ckanext.dcat.profiles import RDFProfile |
| 12 | +from ckanext.dcat.profiles import CleanedURIRef, RDFProfile, SchemaOrgProfile |
12 | 13 | from ckanext.dcat.utils import resource_uri |
13 | 14 | from ckanext.switzerland.helpers import ( |
14 | 15 | get_langs, |
| 16 | + get_publisher_dict_from_dataset, |
15 | 17 | map_to_valid_format, |
16 | 18 | ogdch_get_default_terms_of_use, |
| 19 | + uri_to_iri, |
17 | 20 | ) |
18 | 21 |
|
19 | 22 | log = logging.getLogger(__name__) |
@@ -568,3 +571,281 @@ def _add_distribution_to_graph(self, dataset_ref, resource_dict, dataset_dict): |
568 | 571 | def graph_from_catalog(self, catalog_dict, catalog_ref): |
569 | 572 | g = self.g |
570 | 573 | g.add((catalog_ref, RDF.type, DCAT.Catalog)) |
| 574 | + |
| 575 | + |
class MultiLangProfile(RDFProfile):
    """Mixin profile with helpers for emitting multilingual literals.

    Multilang values are dicts keyed by language code; each entry becomes
    a language-tagged Literal on the graph.
    """

    def _add_multilang_value(
        self, subject, predicate, key=None, data_dict=None, multilang_values=None
    ):
        """Add one language-tagged literal per translation of a value.

        The values can be passed directly via ``multilang_values`` or
        looked up from ``data_dict[key]``. Non-dict values are added as a
        single untranslated Literal.
        """
        if not multilang_values and data_dict and key:
            multilang_values = data_dict.get(key)
        if not multilang_values:
            return
        try:
            for lang, lang_values in multilang_values.items():
                if not lang_values:
                    continue
                # A language entry is either a plain value or nested in
                # another iterable (e.g. keywords).
                if not isinstance(lang_values, list):
                    lang_values = [lang_values]
                for lang_value in lang_values:
                    if lang_value:
                        self.g.add(
                            (subject, predicate, Literal(lang_value, lang=lang))
                        )
        except AttributeError:
            # multilang_values has no .items(), i.e. it is not a dict of
            # translations: add it as a single untranslated Literal.
            self.g.add((subject, predicate, Literal(multilang_values)))

    def _add_multilang_triples_from_dict(self, _dict, subject, items):
        """Add a multilang triple for each (key, predicate, fallbacks, type)
        tuple in ``items``; the type element is ignored."""
        for key, predicate, fallbacks, _ in items:
            self._add_multilang_triple_from_dict(
                _dict, subject, predicate, key, fallbacks=fallbacks
            )

    def _add_multilang_triple_from_dict(
        self, _dict, subject, predicate, key, fallbacks=None
    ):
        """Add a new multilang triple to the graph.

        The subject and predicate are passed as the relevant RDFLib
        objects (URIRef or BNode); the object is extracted from the dict
        with the provided key (see ``_get_dict_value``) and emitted as
        one or more language-tagged literals.
        """
        multilang_values = self._get_dict_value(_dict, key)
        if multilang_values:
            self._add_multilang_value(
                subject, predicate, multilang_values=multilang_values
            )
| 622 | + |
| 623 | + |
class SwissSchemaOrgProfile(SchemaOrgProfile, MultiLangProfile):
    """schema.org RDF profile for DCAT-AP Switzerland datasets.

    Extends ckanext-dcat's ``SchemaOrgProfile`` with multilingual
    literals (via ``MultiLangProfile``) and Swiss specifics: multiple
    contact points, only the first of possibly several temporals, and
    publisher data resolved from the dataset or its CKAN organization.
    """

    def _basic_fields_graph(self, dataset_ref, dataset_dict):
        """Add the basic dataset fields; title/description are multilang."""
        items = [
            ("identifier", SCHEMA.identifier, None, Literal),
            ("version", SCHEMA.version, ["dcat_version"], Literal),
            ("issued", SCHEMA.datePublished, None, Literal),
            ("modified", SCHEMA.dateModified, None, Literal),
            ("author", SCHEMA.author, ["contact_name", "maintainer"], Literal),
            ("url", SCHEMA.sameAs, None, Literal),
        ]
        self._add_triples_from_dict(dataset_dict, dataset_ref, items)

        # Title and description are multilang dicts: one literal per language.
        items = [
            ("title", SCHEMA.name, None, Literal),
            ("description", SCHEMA.description, None, Literal),
        ]
        self._add_multilang_triples_from_dict(dataset_dict, dataset_ref, items)

    def _publisher_graph(self, dataset_ref, dataset_dict):
        """Add the publisher as a schema:Organization with a contact point.

        The publisher node is a ``CleanedURIRef`` when a publisher URI is
        known, otherwise a blank node. Falls back to the CKAN
        organization for the publisher name and URL when the dataset
        itself carries none.
        """
        if any(
            [
                self._get_dataset_value(dataset_dict, "publisher_uri"),
                self._get_dataset_value(dataset_dict, "publisher_name"),
                dataset_dict.get("organization"),
            ]
        ):
            publisher_uri, publisher_name = get_publisher_dict_from_dataset(
                dataset_dict.get("publisher")
            )
            if publisher_uri:
                publisher_details = CleanedURIRef(publisher_uri)
            else:
                # No URI known for the publisher: use an anonymous node.
                publisher_details = BNode()

            self.g.add((publisher_details, RDF.type, SCHEMA.Organization))
            self.g.add((dataset_ref, SCHEMA.publisher, publisher_details))
            self.g.add((dataset_ref, SCHEMA.sourceOrganization, publisher_details))

            if not publisher_name and dataset_dict.get("organization"):
                # Organization titles are multilang dicts, so emit one
                # language-tagged literal per language.
                publisher_name = dataset_dict["organization"]["title"]
                self._add_multilang_value(
                    publisher_details, SCHEMA.name, multilang_values=publisher_name
                )
            else:
                self.g.add((publisher_details, SCHEMA.name, Literal(publisher_name)))

            contact_point = BNode()
            self.g.add((publisher_details, SCHEMA.contactPoint, contact_point))
            self.g.add(
                (contact_point, SCHEMA.contactType, Literal("customer service"))
            )

            publisher_url = self._get_dataset_value(dataset_dict, "publisher_url")
            if not publisher_url and dataset_dict.get("organization"):
                publisher_url = dataset_dict["organization"].get(
                    "url"
                ) or tk.config.get("ckan.site_url", "")

            self.g.add((contact_point, SCHEMA.url, Literal(publisher_url)))
            items = [
                (
                    "publisher_email",
                    SCHEMA.email,
                    ["contact_email", "maintainer_email", "author_email"],
                    Literal,
                ),
                (
                    "publisher_name",
                    SCHEMA.name,
                    ["contact_name", "maintainer", "author"],
                    Literal,
                ),
            ]
            self._add_triples_from_dict(dataset_dict, contact_point, items)

    def _temporal_graph(self, dataset_ref, dataset_dict):
        """Map the first temporal to schema:temporalCoverage.

        schema.org temporalCoverage only allows one temporal while
        DCAT-AP Switzerland allows several, so the mapping always uses
        the first one.
        """
        temporals = self._get_dataset_value(dataset_dict, "temporals")
        try:
            start = temporals[0].get("start_date")
            end = temporals[0].get("end_date")
        except (IndexError, KeyError, TypeError):
            # Do not add temporals if there are none.
            return
        if start and end:
            # ISO-8601 interval notation "start/end".
            self.g.add(
                (dataset_ref, SCHEMA.temporalCoverage, Literal(f"{start}/{end}"))
            )
        elif start:
            self._add_date_triple(dataset_ref, SCHEMA.temporalCoverage, start)
        elif end:
            self._add_date_triple(dataset_ref, SCHEMA.temporalCoverage, end)

    def _tags_graph(self, dataset_ref, dataset_dict):
        """Add multilingual keywords.

        Fix: the previous implementation repeated the identical
        ``_add_multilang_triples_from_dict`` call once per keyword (the
        loop variable was unused). Since the RDF graph is a set of
        triples this only wasted work; a single call produces the same
        graph.
        """
        if dataset_dict.get("keywords"):
            items = [
                ("keywords", SCHEMA.keywords, None, Literal),
            ]
            self._add_multilang_triples_from_dict(dataset_dict, dataset_ref, items)

    def _distribution_basic_fields_graph(self, distribution, resource_dict):
        """Add distribution dates plus multilang title/description."""
        items = [
            ("issued", SCHEMA.datePublished, None, Literal),
            ("modified", SCHEMA.dateModified, None, Literal),
        ]
        self._add_triples_from_dict(resource_dict, distribution, items)

        items = [
            ("title", SCHEMA.name, None, Literal),
            ("description", SCHEMA.description, None, Literal),
        ]
        self._add_multilang_triples_from_dict(resource_dict, distribution, items)

    def contact_details(self, dataset_dict, dataset_ref, g):
        """Add one vcard:Organization per complete contact point.

        Used by ``graph_from_dataset``. Contact points missing either an
        email or a name are skipped. Returns the graph (which is also
        mutated in place).
        """
        if dataset_dict.get("contact_points"):
            contact_points = self._get_dataset_value(dataset_dict, "contact_points")
            for contact_point in contact_points:
                # Only export contact points that have both email and name.
                if not contact_point.get("email") or not contact_point.get("name"):
                    continue
                contact_details = BNode()

                contact_point_email = f"mailto:{contact_point['email']}"
                contact_point_name = contact_point["name"]

                g.add((contact_details, RDF.type, VCARD.Organization))
                g.add((contact_details, VCARD.hasEmail, URIRef(contact_point_email)))
                g.add((contact_details, VCARD.fn, Literal(contact_point_name)))

                g.add((dataset_ref, SCHEMA.contactPoint, contact_details))

        return g

    def download_access_url(self, resource_dict, distribution, g):
        """Add schema:downloadURL / schema:accessURL for a distribution.

        Used by ``graph_from_dataset``. URLs that ``uri_to_iri`` rejects
        with ``ValueError`` are skipped. When the resource URL equals the
        download URL, the download URL is reused as accessURL. Returns
        the graph (which is also mutated in place).
        """
        download_url = resource_dict.get("download_url")
        if download_url:
            try:
                download_url = uri_to_iri(download_url)
                g.add((distribution, SCHEMA.downloadURL, URIRef(download_url)))
            except ValueError:
                # Only add valid URLs.
                pass

        url = resource_dict.get("url")
        # Simplified from `(url and not download_url) or (url and
        # url != download_url)`: when download_url is falsy, a truthy url
        # always differs from it, so the first clause was redundant.
        if url and url != download_url:
            try:
                url = uri_to_iri(url)
                g.add((distribution, SCHEMA.accessURL, URIRef(url)))
            except ValueError:
                # Only add valid URLs.
                pass
        elif download_url:
            g.add((distribution, SCHEMA.accessURL, URIRef(download_url)))

        return g

    def graph_from_dataset(self, dataset_dict, dataset_ref):
        """Serialise the dataset, then delegate to SchemaOrgProfile.

        Adds the contact points and, for each resource, a distribution
        node with simple DCT values, multilang title/description,
        language URIs, download/access URLs, dates and byte size.
        """
        g = self.g

        # Contact details
        self.contact_details(dataset_dict, dataset_ref, g)

        # Loop-invariant triple templates, hoisted out of the resource loop.
        simple_items = [
            ("status", ADMS.status, None, Literal),
            ("coverage", DCT.coverage, None, Literal),
            ("identifier", DCT.identifier, None, Literal),
            ("spatial", DCT.spatial, None, Literal),
        ]
        date_items = [
            ("issued", DCT.issued, None, Literal),
            ("modified", DCT.modified, None, Literal),
        ]

        # Resources
        for resource_dict in dataset_dict.get("resources", []):
            distribution = URIRef(resource_uri(resource_dict))

            g.add((dataset_ref, SCHEMA.distribution, distribution))
            g.add((distribution, RDF.type, SCHEMA.Distribution))

            # Simple values
            self._add_triples_from_dict(resource_dict, distribution, simple_items)

            # Multilang title and description
            self._add_multilang_value(
                distribution, DCT.title, "display_name", resource_dict
            )
            self._add_multilang_value(
                distribution, DCT.description, "description", resource_dict
            )

            # Language: either already a valid EU authority URI, or mapped
            # to one via LANGUAGE_URI_MAPPING.
            for lang in resource_dict.get("language", []):
                if "http://publications.europa.eu/resource/authority" in lang:
                    g.add((distribution, DCT.language, URIRef(lang)))
                else:
                    uri = LANGUAGE_URI_MAPPING.get(lang, None)
                    if uri:
                        g.add((distribution, DCT.language, URIRef(uri)))
                    else:
                        log.debug(f"Language '{lang}' not found in language_uri_map")

            # Download URL & Access URL
            self.download_access_url(resource_dict, distribution, g)

            # Dates
            self._add_date_triples_from_dict(resource_dict, distribution, date_items)

            # ByteSize
            if resource_dict.get("byte_size"):
                g.add(
                    (
                        distribution,
                        SCHEMA.byteSize,
                        Literal(resource_dict["byte_size"]),
                    )
                )

        super().graph_from_dataset(dataset_dict, dataset_ref)

    def parse_dataset(self, dataset_dict, dataset_ref):
        """Delegate parsing to SchemaOrgProfile.

        Fix: return the superclass result instead of dropping it, so
        callers that expect the (possibly updated) dataset dict receive
        it. Callers ignoring the return value are unaffected.
        """
        return super().parse_dataset(dataset_dict, dataset_ref)
0 commit comments