Skip to content

Commit 7e4da10

Browse files
authored
Merge pull request #2056 from OpenEnergyPlatform/feature/1931-update-the-opr-to-use-oemtadata-v2-specification
Update the OPR to use metadata v2
2 parents 5169b74 + 6db55e3 commit 7e4da10

File tree

1 file changed

+72
-247
lines changed

1 file changed

+72
-247
lines changed

dataedit/views.py

Lines changed: 72 additions & 247 deletions
Original file line number | Diff line number | Diff line change
@@ -28,12 +28,12 @@
2828
#
2929
# SPDX-License-Identifier: AGPL-3.0-or-later
3030

31+
3132
import csv
3233
import json
3334
import logging
3435
import os
3536
import re
36-
from collections import defaultdict
3737
from functools import reduce
3838
from io import TextIOWrapper
3939
from itertools import chain
@@ -54,10 +54,9 @@
5454
from django.utils.encoding import smart_str
5555
from django.views.decorators.cache import never_cache
5656
from django.views.generic import View
57-
from oemetadata.v1.v160.schema import OEMETADATA_V160_SCHEMA
5857

5958
# from oemetadata.v1.v160.template import OEMETADATA_V160_TEMPLATE
60-
# from oemetadata.v2.v20.schema import OEMETADATA_V20_SCHEMA
59+
from oemetadata.v2.v20.schema import OEMETADATA_V20_SCHEMA
6160
from oemetadata.v2.v20.template import OEMETADATA_V20_TEMPLATE
6261

6362
# from oemetadata.v2.v20.example import OEMETADATA_V20_EXAMPLE
@@ -67,6 +66,9 @@
6766
import api.parser
6867
from api.actions import describe_columns
6968

69+
# from oemetadata.v1.v160.schema import OEMETADATA_V160_SCHEMA
70+
71+
7072
try:
7173
import oeplatform.securitysettings as sec
7274
except Exception:
@@ -1978,7 +1980,7 @@ def load_json_schema(self):
19781980
Returns:
19791981
dict: JSON schema.
19801982
"""
1981-
json_schema = OEMETADATA_V160_SCHEMA
1983+
json_schema = OEMETADATA_V20_SCHEMA
19821984
return json_schema
19831985

19841986
def parse_keys(self, val, old=""):
@@ -2015,261 +2017,79 @@ def parse_keys(self, val, old=""):
20152017

20162018
def sort_in_category(self, schema, table, oemetadata):
20172019
"""
2018-
Groups metadata fields by top-level categories and subgroups within them.
2019-
If a field has no dot (.), it's considered flat and shown directly.
2020-
If a field has a dot and includes an index (e.g., sources.0.name),
2021-
then all fields with the same index are grouped together and shown in order.
2022-
Adds display_prefix for human-readable (1-based) indexing.
2023-
"""
2024-
import re
2020+
Group flattened OEMetadata v2 fields into thematic buckets and attach
2021+
placeholders required by the review UI.
20252022
2026-
def _plus_one_if_digit(txt: str) -> str:
2027-
return str(int(txt) + 1) if txt.isdigit() else txt
2023+
Each entry in the resulting lists has **five** keys:
20282024
2029-
val = self.parse_keys(oemetadata)
2030-
2031-
main_categories = {
2032-
"general": [],
2033-
"spatial": [],
2034-
"temporal": [],
2035-
"source": [],
2036-
"license": [],
2037-
"resource": [],
2025+
```json
2026+
{
2027+
"field": "<dot‑notation path>",
2028+
"value": "<current value>",
2029+
"newValue": "",
2030+
"reviewer_suggestion": "",
2031+
"suggestion_comment": ""
20382032
}
2033+
```
20392034
2040-
for item in val:
2041-
field = item["field"]
2042-
top_key = field.split(".")[0]
2043-
2044-
if top_key in {
2045-
"name",
2046-
"title",
2047-
"id",
2048-
"description",
2049-
"language",
2050-
"subject",
2051-
"keywords",
2052-
"publicationDate",
2053-
"context",
2054-
}:
2055-
main_categories["general"].append(item)
2056-
elif top_key == "spatial":
2057-
main_categories["spatial"].append(item)
2058-
elif top_key == "temporal":
2059-
main_categories["temporal"].append(item)
2060-
elif top_key == "sources":
2061-
main_categories["source"].append(item)
2062-
elif top_key == "licenses":
2063-
main_categories["license"].append(item)
2064-
elif top_key == "resources":
2065-
main_categories["resource"].append(item)
2066-
2067-
def extract_index(prefix):
2068-
"""
2069-
Return the numeric list index found at the end of *prefix*.
2070-
2071-
Works for both 'sources.0' *and* 'Sources 0'.
2072-
If no trailing index exists, ``-1`` is returned to keep such
2073-
prefixes at the beginning of the ordered result.
2074-
"""
2075-
match = re.search(r"(?:\.|\s)([0-9]+)$", prefix)
2076-
return int(match.group(1)) if match else -1
2077-
2078-
# ------------------------------------------------------------------
2079-
# Helper: inside each "Sources N" split into second‑level list groups
2080-
# e.g. sources.<src_idx>.licenses.0.url → "Licenses 0"
2081-
# sources.<src_idx>.contacts.1.name → "Contacts 1"
2082-
# Any path with pattern <listname>.<index> after the 2‑nd segment
2083-
# becomes its own accordion; everything else stays flat.
2084-
# ------------------------------------------------------------------
2085-
def nest_sublist_groups(source_items):
2086-
"""
2087-
Turn one Source‑block (list of dicts) into:
2088-
{
2089-
"flat": [ ... items without <listname>.<idx> ... ],
2090-
"grouped": { "<Listname> 1": [...], "<Listname> 2": [...], ... }
2091-
}
2092-
2093-
Accepts arbitrary list names, not just 'licenses'.
2094-
"""
2095-
nested = {"flat": [], "grouped": defaultdict(list)}
2096-
2097-
for itm in source_items:
2098-
parts = itm["field"].split(".")
2099-
# expect pattern: sources.<src_idx>.<listname>.<index>.<rest>
2100-
if (
2101-
len(parts) >= 4
2102-
and parts[2].isidentifier() # list name
2103-
and parts[3].isdigit() # index
2104-
):
2105-
list_name = parts[2] # e.g. "licenses"
2106-
idx = parts[3] # e.g. "0"
2107-
display_idx = int(idx) + 1
2108-
group_key = f"{list_name.capitalize()} {display_idx}"
2109-
display_field = ".".join(parts[4:]) or "value"
2110-
2111-
enriched = itm.copy()
2112-
enriched["display_field"] = display_field
2113-
enriched["display_prefix"] = group_key
2114-
enriched["display_index"] = idx
2115-
nested["grouped"][group_key].append(enriched)
2116-
else:
2117-
# keep as flat inside this Source block
2118-
trimmed = ".".join(parts[2:]) if len(parts) > 2 else itm["field"]
2119-
enriched = itm.copy()
2120-
enriched["display_field"] = _plus_one_if_digit(trimmed)
2121-
nested["flat"].append(enriched)
2122-
2123-
# sort groups numerically (… 1, 2, 3 …) within each list name
2124-
nested["grouped"] = dict(
2125-
sorted(
2126-
nested["grouped"].items(),
2127-
key=lambda kv: (
2128-
kv[0].split()[0], # list name
2129-
int(kv[0].split()[-1]), # numeric index
2130-
),
2131-
)
2132-
)
2133-
return nested
2134-
2135-
def group_index_only(items):
2136-
result = {"flat": [], "grouped": defaultdict(list)}
2137-
2138-
for item in items:
2139-
field = item["field"]
2140-
parts = field.split(".")
2141-
2142-
list_idx = None
2143-
list_name = None
2144-
idx_pos = None
2145-
for pos in range(1, len(parts)):
2146-
if parts[pos].isdigit():
2147-
list_idx = parts[pos] # '0'
2148-
list_name = parts[pos - 1] # 'timeseries'
2149-
idx_pos = pos
2150-
break
2035+
Buckets returned:
21512036
2152-
if list_idx is not None:
2153-
# «Timeseries 1», «Bbox 2» …
2154-
display_idx = int(list_idx) + 1 # show 1‑based index
2155-
group_key = f"{list_name.capitalize()} {display_idx}"
2156-
display_field = (
2157-
".".join(parts[idx_pos + 1 :])
2158-
if idx_pos + 1 < len(parts)
2159-
else ""
2160-
)
2037+
* general
2038+
* spatial
2039+
* temporal
2040+
* source
2041+
* license
2042+
"""
21612043

2162-
enriched = item.copy()
2163-
enriched["display_field"] = display_field
2164-
enriched["display_prefix"] = group_key
2165-
enriched["display_index"] = list_idx
2166-
result["grouped"][group_key].append(enriched)
2167-
else:
2168-
trimmed = field.split(".", 1)[1] if "." in field else field
2169-
item["display_field"] = _plus_one_if_digit(trimmed)
2170-
item.pop("display_index", None)
2171-
result["flat"].append(item)
2044+
from collections import defaultdict
21722045

2173-
result["grouped"] = dict(
2174-
sorted(result["grouped"].items(), key=lambda kv: int(kv[0].split()[-1]))
2175-
)
2176-
return result
2177-
2178-
def group_by_index(items):
2179-
"""
2180-
Organise *items* into
2181-
* ``flat`` – fields without any nesting,
2182-
* ``grouped`` – dict whose keys are human‑readable list titles
2183-
such as 'Timeseries 1', 'Sources 2', …
2184-
2185-
All fields that share the same list index (e.g. timeseries.0.*)
2186-
are collected under one group. The groups are ordered by their
2187-
numeric index so that 1, 2, 3 … appear in sequence.
2188-
"""
2189-
result = {"flat": [], "grouped": defaultdict(list)}
2190-
2191-
for item in items:
2192-
field = item["field"]
2193-
parts = field.split(".")
2194-
2195-
# Handle list elements like timeseries.0.start
2196-
if len(parts) >= 3 and parts[1].isdigit():
2197-
index = parts[1] # '0'
2198-
display_idx = int(index) + 1
2199-
group_key = (
2200-
f"{parts[0].capitalize()} {display_idx}" # 'Timeseries 1'
2201-
)
2202-
display_field = ".".join(parts[2:]) # 'start'
2046+
# Flatten the nested JSON into [{'field': k, 'value': v}, ...]
2047+
flattened = self.parse_keys(oemetadata)
2048+
flattened = [
2049+
item for item in flattened if item["field"].startswith("resources.")
2050+
]
22032051

2204-
enriched = item.copy()
2205-
enriched["display_field"] = display_field
2206-
enriched["display_prefix"] = group_key
2207-
enriched["display_index"] = index
2052+
bucket_map = {
2053+
"spatial": "spatial",
2054+
"temporal": "temporal",
2055+
"sources": "source",
2056+
"licenses": "license",
2057+
}
22082058

2209-
result["grouped"][group_key].append(enriched)
2059+
tmp = defaultdict(list)
22102060

2211-
# Handle nested (but non‑list) structures, e.g. spatial.epsg
2212-
elif "." in field:
2213-
group_key = field.split(".")[0] # 'spatial'
2214-
enriched = item.copy()
2215-
raw_tail = ".".join(field.split(".")[1:])
2216-
enriched["display_field"] = _plus_one_if_digit(raw_tail)
2217-
enriched["display_prefix"] = group_key
2218-
enriched.pop("display_index", None)
2061+
for item in flattened:
2062+
raw_key = item["field"]
2063+
parts = raw_key.split(".")
22192064

2220-
result["grouped"][group_key].append(enriched)
2065+
# Detect v2 resource path → resources.<idx>.<root>.…
2066+
if parts[0] == "resources" and len(parts) >= 3:
2067+
root = parts[2]
2068+
else:
2069+
root = parts[0]
22212070

2222-
# Handle completely flat fields
2223-
else:
2224-
item["display_field"] = field
2225-
item["display_field"] = _plus_one_if_digit(item["display_field"])
2226-
item.pop("display_index", None)
2227-
result["flat"].append(item)
2228-
2229-
# Sort grouped entries by their numeric index (Timeseries 1, 2, 3 …)
2230-
sorted_grouped = dict(
2231-
sorted(result["grouped"].items(), key=lambda kv: extract_index(kv[0]))
2232-
)
2233-
return {"flat": result["flat"], "grouped": sorted_grouped}
2234-
2235-
grouped_meta = {}
2236-
for cat, items in main_categories.items():
2237-
if cat in {"spatial", "temporal"}:
2238-
grouped = group_index_only(items) # only list‑index grouping
2239-
elif cat == "source":
2240-
# First‑level grouping: Source 0, Source 1, …
2241-
src_level = group_index_only(items)
2242-
2243-
# For every 'Sources N' list build inner sublist groups
2244-
nested_grouped = {}
2245-
for src_key, src_items in src_level["grouped"].items():
2246-
nested_grouped[src_key] = nest_sublist_groups(src_items)
2247-
2248-
grouped = {
2249-
"flat": src_level["flat"],
2250-
"grouped": nested_grouped,
2251-
}
2252-
elif cat == "license":
2253-
# First‑level grouping: License 0, License 1, …
2254-
lic_level = group_index_only(items)
2255-
2256-
# For every 'Licenses N' entry build inner sub‑list groups
2257-
nested_grouped_lic = {}
2258-
for lic_key, lic_items in lic_level["grouped"].items():
2259-
nested_grouped_lic[lic_key] = nest_sublist_groups(lic_items)
2260-
2261-
grouped = {
2262-
"flat": lic_level["flat"],
2263-
"grouped": nested_grouped_lic,
2071+
bucket = bucket_map.get(root, "general")
2072+
2073+
# Extend structure with placeholders expected by review workflow
2074+
tmp[bucket].append(
2075+
{
2076+
"field": raw_key,
2077+
"value": item["value"],
2078+
"newValue": "",
2079+
"reviewer_suggestion": "",
2080+
"suggestion_comment": "",
22642081
}
2265-
else:
2266-
grouped = group_by_index(items) # previous behaviour
2267-
grouped_meta[cat] = {
2268-
"flat": grouped["flat"],
2269-
"grouped": grouped["grouped"],
2270-
}
2082+
)
22712083

2272-
return grouped_meta
2084+
# Guarantee keys exist even when empty
2085+
buckets = {
2086+
"general": tmp["general"],
2087+
"spatial": tmp["spatial"],
2088+
"temporal": tmp["temporal"],
2089+
"source": tmp["source"],
2090+
"license": tmp["license"],
2091+
}
2092+
return buckets
22732093

22742094
def get_all_field_descriptions(self, json_schema, prefix=""):
22752095
"""
@@ -2296,12 +2116,17 @@ def extract_descriptions(properties, prefix=""):
22962116

22972117
if any(
22982118
attr in value
2299-
for attr in ["description", "example", "badge", "title"]
2119+
for attr in ["description", "examples", "example", "badge", "title"]
23002120
):
23012121
field_descriptions[key] = {}
23022122
if "description" in value:
23032123
field_descriptions[key]["description"] = value["description"]
2304-
if "example" in value:
2124+
# Prefer v2 "examples" (array) over v1 "example" (single value)
2125+
if "examples" in value and value["examples"]:
2126+
# v2: first item of the examples array
2127+
field_descriptions[key]["example"] = value["examples"][0]
2128+
elif "example" in value:
2129+
# v1 fallback
23052130
field_descriptions[key]["example"] = value["example"]
23062131
if "badge" in value:
23072132
field_descriptions[key]["badge"] = value["badge"]

0 commit comments

Comments (0)