Merge pull request #268 from sciknoworg/dev

HamedBabaei · web-flow · commit b6fc1ee374a0 · 2025-08-22T22:30:23.000+02:00
add ontolearner Dublin Core metadata exporter
diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
@@ -37,3 +37,18 @@ jobs:
       - name: Publish to PyPI
         run: |
           poetry publish --no-interaction --no-ansi
+
+      # 🔹 NEW STEP: Generate metadata after publishing
+      - name: Generate Dublin Core metadata
+        run: |
+          mkdir -p metadata
+          poetry run python -c "from ontolearner import OntoLearnerMetadataExporter; OntoLearnerMetadataExporter().export('metadata/ontolearner-metadata.rdf')"
+
+      # 🔹 Commit metadata back to repo
+      - name: Commit and push metadata
+        run: |
+          git config --global user.name "github-actions[bot]"
+          git config --global user.email "github-actions[bot]@users.noreply.github.com"
+          git add metadata/
+          # "git commit" exits non-zero when nothing is staged; skip commit and push in that case.
+          git diff --cached --quiet || { git commit -m ":bookmark: Update metadata after release" && git push origin HEAD:main; }
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -185,6 +185,7 @@ or GitHub repository:
    ontologizer/ontology_modularization
    ontologizer/ontology_hosting
    ontologizer/new_ontologies
+   ontologizer/metadata
 
 .. toctree::
    :maxdepth: 1
diff --git a/docs/source/ontologizer/metadata.rst b/docs/source/ontologizer/metadata.rst
@@ -0,0 +1,150 @@
+Metadata
+=============================
+
+.. note::
+
+	OntoLearner metadata is generated automatically and stored on GitHub under the `metadata/ <https://github.com/sciknoworg/OntoLearner/tree/main/metadata>`_ directory. For ``ontolearner > 1.3.1``, it is also available for download with each release on the `Releases <https://github.com/sciknoworg/OntoLearner/releases>`_ page.
+
+.. hint::
+
+	The metadata release is fully automated through CI/CD, ensuring it is generated automatically with each PyPI release.
+
+.. sidebar:: OntoLearner Metadata Exporter Features
+
+	- Generates `Dublin Core metadata <https://www.dublincore.org/specifications/dublin-core/dces/>`_ for each ontology in the library
+	- Creates a top-level ``Collection`` resource for OntoLearner
+	- Supports RDF/XML serialization in a clean, human-readable format
+	- Uses a custom ``ontologizer`` namespace for ontology-specific resources
+
+
+The ``OntoLearnerMetadataExporter`` is a utility class for generating **Dublin Core (DCMI) metadata** for all ontologies benchmarked in the OntoLearner library. It collects essential metadata, including ontology title and description, creator/authors, license information, format, version, and last updated date, domain and category, and download URL. Additionally, it generates a **top-level collection resource** that describes the entire OntoLearner benchmarking suite. The output is a **pretty-printed RDF/XML file** compatible with standard semantic web tools and parsers.
+
+
+**Example RDF structure:**
+
+.. code-block:: xml
+
+    <rdf:RDF
+      xmlns:dc="http://purl.org/dc/elements/1.1/"
+      xmlns:dcterms="http://purl.org/dc/terms/"
+      xmlns:ontologizer="https://ontolearner.readthedocs.io/ontologizer/ontology_modularization.html#"
+      xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
+
+      <!-- Top-level collection -->
+      <ontologizer:Collection rdf:about="https://ontolearner.readthedocs.io/benchmarking/">
+        <dc:title>OntoLearner Benchmark Ontologies</dc:title>
+        <dc:description>This Dublin Core metadata collection describes ontologies benchmarked in OntoLearner. It includes information such as title, creator, format, license, and version.</dc:description>
+        <dc:creator>OntoLearner Team</dc:creator>
+        <dcterms:license>MIT License</dcterms:license>
+        <dcterms:hasVersion>1.4.0</dcterms:hasVersion>
+      </ontologizer:Collection>
+
+      <!-- Individual ontology metadata -->
+      <ontologizer:Ontology rdf:about="https://ontolearner.readthedocs.io/benchmarking/medicine/ncit.html">
+        <dc:identifier>NCIt</dc:identifier>
+        <dcterms:title>NCI Thesaurus (NCIt)</dcterms:title>
+        <dcterms:description>NCI Thesaurus (NCIt) is a reference terminology that includes broad coverage of the cancer domain...</dcterms:description>
+        <dcterms:format>OWL</dcterms:format>
+        <dcterms:date>2023-10-19</dcterms:date>
+        <dcterms:license>Creative Commons 4.0</dcterms:license>
+        <dcterms:source>https://terminology.tib.eu/ts/ontologies/NCIT</dcterms:source>
+        <dcterms:subject>Medicine</dcterms:subject>
+        <dcterms:subject>Cancer, Oncology</dcterms:subject>
+        <dcterms:hasVersion>24.04e</dcterms:hasVersion>
+      </ontologizer:Ontology>
+
+    </rdf:RDF>
+
+
+Properties
+-------------------------------------
+The following table summarizes the key **Dublin Core metadata properties** captured for each ontology in OntoLearner. It provides a quick overview of the ontology’s identifier, title, description, authorship, format, license, domain, and version information, helping users understand and reference the ontologies consistently.
+
+.. list-table:: **OntoLearner Metadata Properties**
+   :header-rows: 0
+   :widths: 40 40 40
+
+   * - **Property**
+     - **Example**
+     - **Description**
+   * - ``dc:identifier``
+     - NCIt
+     - Ontology ID
+   * - ``dcterms:title``
+     - NCI Thesaurus (NCIt)
+     - Ontology full name
+   * - ``dcterms:description``
+     - See above example RDF structure
+     - Detailed ontology description
+   * - ``dcterms:creator``
+     - NCI
+     - Creator / author
+   * - ``dcterms:format``
+     - OWL
+     - Ontology format
+   * - ``dcterms:date``
+     - 2023-10-19
+     - Last updated
+   * - ``dcterms:license``
+     - Creative Commons 4.0
+     - License information
+   * - ``dcterms:source``
+     - URL
+     - Download or reference URL
+   * - ``dcterms:subject``
+     - Medicine
+     - Domain or category
+   * - ``dcterms:hasVersion``
+     - 24.04e
+     - Ontology version
+
+The following shows the benchmark collection metadata. The ``dcterms:hasVersion`` value is the OntoLearner library version with which the metadata was released.
+
+.. code-block:: xml
+
+    <ontologizer:Collection rdf:about="https://ontolearner.readthedocs.io/benchmarking/">
+      <dc:title>OntoLearner Benchmark Ontologies</dc:title>
+      <dc:description>This Dublin Core metadata collection describes ontologies benchmarked in OntoLearner. It includes information such as title, creator, format, license, and version.</dc:description>
+      <dc:creator>OntoLearner Team</dc:creator>
+      <dcterms:license>MIT License</dcterms:license>
+      <dcterms:hasVersion>1.4.0</dcterms:hasVersion>
+    </ontologizer:Collection>
+
+Exporter
+--------------------
+
+``OntoLearnerMetadataExporter`` is included in the OntoLearner library, so you can generate the metadata and store it locally.
+
+.. code-block:: python
+
+    from ontolearner import OntoLearnerMetadataExporter
+
+    # Initialize exporter
+    exporter = OntoLearnerMetadataExporter()
+
+    # Export metadata to RDF/XML
+    exporter.export("ontolearner-metadata.rdf")
+
+The above code outputs:
+
+- **File:** ``ontolearner-metadata.rdf``
+- **Format:** Pretty-printed RDF/XML
+- **Content:** metadata for each ontology
+
+The top-level collection describes the entire OntoLearner benchmark, while each ontology entry includes detailed metadata using Dublin Core and DCTERMS properties.
+
+.. hint::
+
+	**Namespace Bindings:** The exporter uses the following namespaces in the RDF output:
+
+	- ``dc``: ``http://purl.org/dc/elements/1.1/``
+	- ``dcterms``: ``http://purl.org/dc/terms/``
+	- ``ontologizer``: ``https://ontolearner.readthedocs.io/ontologizer/ontology_modularization.html#``
+	- ``rdf``: ``http://www.w3.org/1999/02/22-rdf-syntax-ns#``
+
+.. note::
+
+	- The **Collection resource** always appears first in the RDF/XML output.
+	- Individual ontologies are serialized as ``ontologizer:Ontology`` resources.
+	- The ``export()`` method automatically reads the OntoLearner library version from the ``VERSION`` file.
+	- The RDF/XML output is compatible with standard semantic web tools like **Protégé**, **RDFLib**, and **Apache Jena**.
diff --git a/metadata/metadata-exporter.py b/metadata/metadata-exporter.py
@@ -0,0 +1,6 @@
+
+from ontolearner import OntoLearnerMetadataExporter
+
+exporter = OntoLearnerMetadataExporter()
+# Writes the RDF/XML metadata to "metadata.rdf", relative to the current working directory.
+exporter.export("metadata.rdf")
diff --git a/ontolearner/VERSION b/ontolearner/VERSION
@@ -0,0 +1 @@
+1.3.1
diff --git a/ontolearner/__init__.py b/ontolearner/__init__.py
@@ -11,8 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from pathlib import Path
 
-__version__ = "1.3.1"
+# Load version from VERSION file
+__version__ = (Path(__file__).parent / "VERSION").read_text().strip()
 
 import logging
 from ontolearner import (ontology,
@@ -22,7 +24,7 @@
                          tools,
                          data_structure)
 from .ontology import * # noqa
-from ._ontology import AutoOntology
+from ._ontology import AutoOntology, OntoLearnerMetadataExporter
 from .learner import (AutoLLMLearner,
                       AutoRetrieverLearner,
                       AutoRAGLearner,
@@ -38,6 +40,7 @@
 __all__ = [
     "AutoLLMLearner",
     "AutoOntology",
+    "OntoLearnerMetadataExporter",
     "AutoRetrieverLearner",
     "AutoRAGLearner",
     "StandardizedPrompting",
diff --git a/ontolearner/_ontology.py b/ontolearner/_ontology.py
@@ -13,6 +13,11 @@
 # limitations under the License.
 
 import inspect
+import os
+import xml.etree.ElementTree as ET
+from rdflib import Graph, URIRef, Literal, Namespace, RDF
+from xml.dom import minidom
+
 import ontolearner.ontology as ontology_module
 from .base import BaseOntology
 
@@ -38,12 +43,12 @@ class AutoOntology:
        Example:
            >>> auto_onto = AutoOntology("AgrO")
            >>> print(type(auto_onto))
-           <class 'ontolearner.ontology.AgrO'>
+           >>> <class 'ontolearner.ontology.AgrO'>
 
            If no class matches "unknownontology":
            >>> auto_onto = AutoOntology("unknownontology")
            >>> print(type(auto_onto))
-           <class 'ontolearner.base.BaseOntology'>
+           >>> <class 'ontolearner.base.BaseOntology'>
        """
     def __new__(self, ontology_id) -> BaseOntology:
         for name, obj in inspect.getmembers(ontology_module):
@@ -53,3 +58,94 @@ def __new__(self, ontology_id) -> BaseOntology:
                     if str(obj).split("'")[-2].split(".")[-1].lower() == ontology_id.lower():
                         return instance
         return BaseOntology()
+
+
+
+class OntoLearnerMetadataExporter:
+    """Export Dublin Core (DCMI) metadata for the OntoLearner ontologies.
+
+    The output is one RDF/XML document: a top-level ``ontologizer:Collection``
+    resource followed by one ``ontologizer:Ontology`` resource per ontology
+    class found in ``ontolearner.ontology``.
+    """
+    def __init__(self):
+        # rdflib serializer name; ``export`` re-parses its output as XML.
+        self.format: str = "pretty-xml"
+
+    def get_url(self, domain, ontology_id):
+        """Return the benchmarking documentation URL for an ontology."""
+        return f"https://ontolearner.readthedocs.io/benchmarking/{domain.lower().replace(' ', '_')}/{ontology_id.lower()}.html"
+
+    def export(self, path: str = "DCMI-Metadata.rdf"):
+        """Write the collection and per-ontology metadata to ``path``.
+
+        Args:
+            path: Destination RDF/XML file; it is overwritten if it exists.
+        """
+        DC = Namespace("http://purl.org/dc/elements/1.1/")
+        DCTERMS = Namespace("http://purl.org/dc/terms/")
+        ONTOLOGIZER = Namespace("https://ontolearner.readthedocs.io/ontologizer/ontology_modularization.html#")
+        nsmap = {
+            "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
+            "dc": str(DC),
+            "dcterms": str(DCTERMS),
+            "ontologizer": str(ONTOLOGIZER),
+        }
+        # Head (collection) and body (ontologies) graphs share prefix bindings.
+        g_head, g_body = Graph(), Graph()
+        for graph in (g_head, g_body):
+            for prefix, namespace in (("dc", DC), ("dcterms", DCTERMS), ("ontologizer", ONTOLOGIZER)):
+                graph.bind(prefix, namespace)
+
+        # Read the library version with a context manager so the handle is closed.
+        with open(os.path.join(os.path.dirname(__file__), "VERSION"), encoding="utf-8") as version_file:
+            library_version = version_file.read().strip()
+
+        collection_uri = URIRef("https://ontolearner.readthedocs.io/benchmarking/")
+        g_head.add((collection_uri, RDF.type, ONTOLOGIZER.Collection))
+        g_head.add((collection_uri, DC.title, Literal("OntoLearner Benchmark Ontologies")))
+        g_head.add((collection_uri, DC.description, Literal(
+            "This Dublin Core metadata collection describes ontologies benchmarked in OntoLearner. "
+            "It includes information such as title, creator, format, license, and version."
+        )))
+        g_head.add((collection_uri, DC.creator, Literal("OntoLearner Team")))
+        g_head.add((collection_uri, DCTERMS.license, Literal("MIT License")))
+        g_head.add((collection_uri, DCTERMS.hasVersion, Literal(library_version)))
+
+        for name, obj in inspect.getmembers(ontology_module, inspect.isclass):
+            if name == "BaseOntology" or not (hasattr(obj, "ontology_id") and callable(getattr(obj, "load", None))):
+                continue
+            onto = obj()
+            uri = URIRef(self.get_url(onto.domain, onto.ontology_id))
+            g_body.add((uri, RDF.type, ONTOLOGIZER.Ontology))
+            g_body.add((uri, DC.identifier, Literal(onto.ontology_id)))
+            g_body.add((uri, DCTERMS.title, Literal(onto.ontology_full_name)))
+            # ``__doc__`` may be None; split/join also collapses docstring indentation.
+            description = " ".join((onto.__doc__ or "").split())
+            optional = (
+                (DCTERMS.description, description),
+                (DCTERMS.creator, onto.creator),
+                (DCTERMS["format"], onto.format),
+                (DCTERMS.date, onto.last_updated),
+                (DCTERMS.license, onto.license),
+                (DCTERMS.source, onto.download_url),
+                (DCTERMS.subject, onto.domain),
+                (DCTERMS.subject, onto.category),
+                (DCTERMS.hasVersion, onto.version),
+            )
+            for predicate, value in optional:
+                if value:  # omit properties the ontology does not define
+                    g_body.add((uri, predicate, Literal(value)))
+
+        for prefix, namespace_uri in nsmap.items():
+            ET.register_namespace(prefix, namespace_uri)
+        # Parse each graph's XML and append the head first, so the Collection leads.
+        merged_root = ET.Element(f'{{{nsmap["rdf"]}}}RDF')
+        for graph in (g_head, g_body):
+            merged_root.extend(list(ET.fromstring(graph.serialize(format=self.format))))
+        reparsed = minidom.parseString(ET.tostring(merged_root, encoding="utf-8"))
+        pretty_str = reparsed.toprettyxml(indent="    ", encoding="utf-8").decode("utf-8")
+        # Drop the whitespace-only lines left in the pretty-printed text.
+        pretty_str = "\n".join(line for line in pretty_str.splitlines() if line.strip())
+        with open(path, "w", encoding="utf-8") as f:
+            f.write(pretty_str)
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,8 +1,6 @@
 [tool.poetry]
 name = "OntoLearner"
-
-version = "1.3.1"
-
+version = "0.0.0"  # placeholder, will be replaced automatically
 description = "OntoLearner: A Modular Python Library for Ontology Learning with LLMs."
 authors = ["Hamed Babaei Giglou <hamedbabaeigiglou@gmail.com>", "Andrei C. Aioanei <andrei.c.aioanei@gmail.com>"]
 license = "MIT License"
@@ -40,6 +38,12 @@ wheel = "*"
 twine = "*"
 pytest = "*"
 
+[tool.poetry-dynamic-versioning]
+enable = true
+vcs = "git"
+style = "semver"
+pattern = "tag"
+
 [build-system]
-requires = ["poetry-core>=1.0.0"]
-build-backend = "poetry.core.masonry.api"
+requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning>=1.4.0"]
+build-backend = "poetry_dynamic_versioning.backend"
diff --git a/setup.py b/setup.py
@@ -1,11 +1,12 @@
 from setuptools import setup, find_packages
+import os
 
 with open("README.md", encoding="utf-8") as f:
     long_description = f.read()
 
 setup(
     name="OntoLearner",
-    version="1.3.1",
+    version=open(os.path.join(os.path.dirname(__file__), 'ontolearner/VERSION')).read().strip(),
     author="Hamed Babaei Giglou, Andrei C. Aioanei",
     author_email="hamedbabaeigiglou@gmail.com, andrei.c.aioanei@gmail.com",
     description="OntoLearner: A Modular Python Library for Ontology Learning with LLMs.",