Skip to content

Commit 8f9e39f

Browse files
Merge pull request #14 from deepesdl/tejas-xxx-comply-with-latest-validation
Comply with latest validation and add more related links to workflow record
2 parents cdc941e + febcc45 commit 8f9e39f

File tree

11 files changed

+268
-25
lines changed

11 files changed

+268
-25
lines changed

deep_code/constants.py

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -29,3 +29,13 @@
2929
".json"
3030
)
3131
PROJECT_COLLECTION_NAME = "deep-earth-system-data-lab"
32+
DEEPESDL_GIT_PULL_BASE = (
33+
"https://deep.earthsystemdatalab.net/hub/user-redirect/git-pull"
34+
)
35+
APPLICATION_TYPE_JUPYTER_SPEC = (
36+
"https://raw.githubusercontent.com/EOEPCA/metadata"
37+
"-profile/refs/heads/1.0/schemas/application-type-jupyter-notebook"
38+
)
39+
APPLICATION_STAC_EXTENSION_SPEC = (
40+
"https://stac-extensions.github.io/application/v0.1.0/schema.json"
41+
)

deep_code/tests/tools/test_publish.py

Lines changed: 58 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,10 +4,12 @@
44
from pathlib import Path
55
from unittest.mock import MagicMock, mock_open, patch
66

7+
import pytest
78
import yaml
89
from pystac import Catalog
910

1011
from deep_code.tools.publish import Publisher
12+
from deep_code.utils.ogc_api_record import LinksBuilder
1113

1214

1315
class TestPublisher(unittest.TestCase):
@@ -107,3 +109,59 @@ def test_read_config_files(self):
107109
# Assertions
108110
self.assertEqual(self.publisher.dataset_config, dataset_config)
109111
self.assertEqual(self.publisher.workflow_config, workflow_config)
112+
113+
114+
class TestParseGithubNotebookUrl:
115+
@pytest.mark.parametrize(
116+
"url,repo_url,repo_name,branch,file_path",
117+
[
118+
(
119+
"https://github.com/deepesdl/cube-gen/blob/main/Permafrost/Create-CCI-Permafrost-cube-EarthCODE.ipynb",
120+
"https://github.com/deepesdl/cube-gen",
121+
"cube-gen",
122+
"main",
123+
"Permafrost/Create-CCI-Permafrost-cube-EarthCODE.ipynb",
124+
),
125+
(
126+
"https://github.com/deepesdl/cube-gen/tree/release-1.0/Permafrost/Create-CCI-Permafrost-cube-EarthCODE.ipynb",
127+
"https://github.com/deepesdl/cube-gen",
128+
"cube-gen",
129+
"release-1.0",
130+
"Permafrost/Create-CCI-Permafrost-cube-EarthCODE.ipynb",
131+
),
132+
(
133+
"https://raw.githubusercontent.com/deepesdl/cube-gen/main/Permafrost/Create-CCI-Permafrost-cube-EarthCODE.ipynb",
134+
"https://github.com/deepesdl/cube-gen",
135+
"cube-gen",
136+
"main",
137+
"Permafrost/Create-CCI-Permafrost-cube-EarthCODE.ipynb",
138+
),
139+
],
140+
)
141+
def test_valid_urls(self, url, repo_url, repo_name, branch, file_path):
142+
got_repo_url, got_repo_name, got_branch, got_file_path = LinksBuilder._parse_github_notebook_url(
143+
url
144+
)
145+
assert got_repo_url == repo_url
146+
assert got_repo_name == repo_name
147+
assert got_branch == branch
148+
assert got_file_path == file_path
149+
150+
def test_invalid_domain(self):
151+
url = "https://gitlab.com/deepesdl/cube-gen/-/blob/main/Permafrost/Create-CCI-Permafrost-cube-EarthCODE.ipynb"
152+
with pytest.raises(ValueError) as e:
153+
LinksBuilder._parse_github_notebook_url(url)
154+
assert "Only GitHub URLs are supported" in str(e.value)
155+
156+
def test_unexpected_github_format_missing_blob_or_tree(self):
157+
# Missing the "blob" or "tree" segment
158+
url = "https://github.com/deepesdl/cube-gen/main/Permafrost/Create-CCI-Permafrost-cube-EarthCODE.ipynb"
159+
with pytest.raises(ValueError) as e:
160+
LinksBuilder._parse_github_notebook_url(url)
161+
assert "Unexpected GitHub URL format" in str(e.value)
162+
163+
def test_unexpected_raw_format_too_short(self):
164+
url = "https://raw.githubusercontent.com/deepesdl/cube-gen/main"
165+
with pytest.raises(ValueError) as e:
166+
LinksBuilder._parse_github_notebook_url(url)
167+
assert "Unexpected raw.githubusercontent URL format" in str(e.value)

deep_code/tests/utils/test_dataset_stac_generator.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -65,8 +65,11 @@ def setUp(self, mock_data_store):
6565
self.generator = OscDatasetStacGenerator(
6666
dataset_id="mock-dataset-id",
6767
collection_id="mock-collection-id",
68+
workflow_id="dummy",
69+
workflow_title="test",
6870
access_link="s3://mock-bucket/mock-dataset",
6971
documentation_link="https://example.com/docs",
72+
license_type="proprietary",
7073
osc_status="ongoing",
7174
osc_region="Global",
7275
osc_themes=["climate", "environment"],

deep_code/tests/utils/test_ogc_api_record.py

Lines changed: 16 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,10 @@
11
import unittest
22

3-
from deep_code.constants import OGC_API_RECORD_SPEC
3+
from deep_code.constants import (
4+
APPLICATION_STAC_EXTENSION_SPEC,
5+
APPLICATION_TYPE_JUPYTER_SPEC,
6+
OGC_API_RECORD_SPEC,
7+
)
48
from deep_code.utils.ogc_api_record import (
59
Contact,
610
ExperimentAsOgcRecord,
@@ -136,7 +140,9 @@ def test_record_properties_to_dict(self):
136140

137141
class TestLinksBuilder(unittest.TestCase):
138142
def test_build_theme_links_for_records(self):
139-
links_builder = LinksBuilder(themes=["climate", "ocean"])
143+
links_builder = LinksBuilder(
144+
themes=["climate", "ocean"], jupyter_kernel_info={}
145+
)
140146
theme_links = links_builder.build_theme_links_for_records()
141147

142148
expected_links = [
@@ -201,7 +207,14 @@ def test_workflow_as_ogc_record_initialization(self):
201207
workflow_record.jupyter_notebook_url, "https://example.com/notebook.ipynb"
202208
)
203209
self.assertEqual(workflow_record.properties, record_properties)
204-
self.assertEqual(workflow_record.conformsTo, [OGC_API_RECORD_SPEC])
210+
self.assertEqual(
211+
workflow_record.conformsTo,
212+
[
213+
OGC_API_RECORD_SPEC,
214+
APPLICATION_TYPE_JUPYTER_SPEC,
215+
APPLICATION_STAC_EXTENSION_SPEC,
216+
],
217+
)
205218
self.assertEqual(workflow_record.links[0]["rel"], "root")
206219
self.assertEqual(workflow_record.links[-1]["rel"], "self")
207220

deep_code/tools/new.py

Lines changed: 10 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,11 @@ def generate_workflow_template(output_path: Optional[str] = None) -> str:
2020
"title": "[Human-readable title of the workflow]",
2121
"description": "[A concise summary of what the workflow does]",
2222
"keywords": ["[KEYWORD1]", "[KEYWORD2]"],
23-
"themes": ["[Thematic area(s) of focus (e.g. land, ocean, atmosphere)]","[THEME1]", "[THEME2]"],
23+
"themes": [
24+
"[Thematic area(s) of focus (e.g. land, ocean, atmosphere)]",
25+
"[THEME1]",
26+
"[THEME2]",
27+
],
2428
"license": "[License type (e.g. MIT, Apache-2.0, CC-BY-4.0, proprietary)]",
2529
"jupyter_kernel_info": {
2630
"name": "[Name of the execution environment or notebook kernel]",
@@ -61,8 +65,11 @@ def generate_dataset_template(output_path: Optional[str] = None) -> str:
6165
template = {
6266
"dataset_id": "[The name of the dataset object within your S3 bucket].zarr",
6367
"collection_id": "[A unique identifier for the dataset collection]",
64-
"osc_themes": ["[Oceans]", "[Open Science theme (choose from "
65-
"https://opensciencedata.esa.int/themes/catalog)"],
68+
"osc_themes": [
69+
"[Oceans]",
70+
"[Open Science theme (choose from "
71+
"https://opensciencedata.esa.int/themes/catalog)",
72+
],
6673
"osc_region": "[Geographical coverage, e.g. 'global']",
6774
"dataset_status": "[Status of the dataset: 'ongoing', 'completed', or 'planned']",
6875
"documentation_link": "[Link to relevant documentation, publication, or handbook]",

deep_code/tools/publish.py

Lines changed: 27 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -4,12 +4,12 @@
44
# https://opensource.org/licenses/MIT.
55

66
import copy
7-
import json
87
import logging
98
from datetime import datetime
109
from pathlib import Path
1110

1211
import fsspec
12+
import jsonpickle
1313
import yaml
1414
from pystac import Catalog, Link
1515

@@ -22,7 +22,6 @@
2222
)
2323
from deep_code.utils.dataset_stac_generator import OscDatasetStacGenerator
2424
from deep_code.utils.github_automation import GitHubAutomation
25-
from deep_code.utils.helper import serialize
2625
from deep_code.utils.ogc_api_record import (
2726
ExperimentAsOgcRecord,
2827
LinksBuilder,
@@ -130,6 +129,7 @@ def __init__(
130129
self._read_config_files()
131130
self.collection_id = self.dataset_config.get("collection_id")
132131
self.workflow_title = self.workflow_config.get("properties", {}).get("title")
132+
self.workflow_id = self.workflow_config.get("workflow_id")
133133

134134
if not self.collection_id:
135135
raise ValueError("collection_id is missing in dataset config.")
@@ -151,11 +151,12 @@ def _write_to_file(file_path: str, data: dict):
151151
# Create the directory if it doesn't exist
152152
Path(file_path).parent.mkdir(parents=True, exist_ok=True)
153153
try:
154-
json_content = json.dumps(data, indent=2, default=serialize)
154+
# unpicklable=False -> plain JSON (drops type metadata); cycles are resolved.
155+
json_content = jsonpickle.encode(data, unpicklable=False, indent=2)
155156
except TypeError as e:
156157
raise RuntimeError(f"JSON serialization failed: {e}")
157158

158-
with open(file_path, "w") as f:
159+
with open(file_path, "w", encoding="utf-8") as f:
159160
f.write(json_content)
160161

161162
def _update_and_add_to_file_dict(
@@ -217,6 +218,7 @@ def publish_dataset(self, write_to_file: bool = False):
217218
osc_region = self.dataset_config.get("osc_region")
218219
osc_themes = self.dataset_config.get("osc_themes")
219220
cf_params = self.dataset_config.get("cf_parameter")
221+
license_type = self.dataset_config.get("license_type")
220222

221223
if not dataset_id or not self.collection_id:
222224
raise ValueError("Dataset ID or Collection ID missing in the config.")
@@ -226,6 +228,9 @@ def publish_dataset(self, write_to_file: bool = False):
226228
generator = OscDatasetStacGenerator(
227229
dataset_id=dataset_id,
228230
collection_id=self.collection_id,
231+
workflow_id=self.workflow_id,
232+
workflow_title=self.workflow_title,
233+
license_type=license_type,
229234
documentation_link=documentation_link,
230235
access_link=access_link,
231236
osc_status=dataset_status,
@@ -310,7 +315,7 @@ def _update_base_catalog(
310315

311316
return base_catalog
312317

313-
def publish_workflow_experiment(self, write_to_file: bool = False):
318+
def generate_workflow_experiment_records(self, write_to_file: bool = False) -> None:
314319
"""prepare workflow and experiment as ogc api record to publish it to the
315320
specified GitHub repository."""
316321
workflow_id = self._normalize_name(self.workflow_config.get("workflow_id"))
@@ -328,16 +333,23 @@ def publish_workflow_experiment(self, write_to_file: bool = False):
328333
wf_record_properties = rg.build_record_properties(properties_list, contacts)
329334
# make a copy for experiment record
330335
exp_record_properties = copy.deepcopy(wf_record_properties)
336+
jupyter_kernel_info = wf_record_properties.jupyter_kernel_info.to_dict()
331337

332-
link_builder = LinksBuilder(osc_themes)
338+
link_builder = LinksBuilder(osc_themes, jupyter_kernel_info)
333339
theme_links = link_builder.build_theme_links_for_records()
340+
application_link = link_builder.build_link_to_jnb(
341+
self.workflow_title, jupyter_notebook_url
342+
)
343+
jnb_open_link = link_builder.make_related_link_for_opening_jnb_from_github(
344+
jupyter_notebook_url=jupyter_notebook_url
345+
)
334346

335347
workflow_record = WorkflowAsOgcRecord(
336348
id=workflow_id,
337349
type="Feature",
338350
title=self.workflow_title,
339351
properties=wf_record_properties,
340-
links=links + theme_links,
352+
links=links + theme_links + application_link + jnb_open_link,
341353
jupyter_notebook_url=jupyter_notebook_url,
342354
themes=osc_themes,
343355
)
@@ -347,21 +359,24 @@ def publish_workflow_experiment(self, write_to_file: bool = False):
347359
del workflow_dict["jupyter_notebook_url"]
348360
if "osc_workflow" in workflow_dict["properties"]:
349361
del workflow_dict["properties"]["osc_workflow"]
362+
# add workflow record to file_dict
350363
wf_file_path = f"workflows/{workflow_id}/record.json"
351364
file_dict = {wf_file_path: workflow_dict}
352365

353366
# Build properties for the experiment record
354367
exp_record_properties.type = "experiment"
355368
exp_record_properties.osc_workflow = workflow_id
356369

370+
dataset_link = link_builder.build_link_to_dataset(self.collection_id)
371+
357372
experiment_record = ExperimentAsOgcRecord(
358373
id=workflow_id,
359374
title=self.workflow_title,
360375
type="Feature",
361376
jupyter_notebook_url=jupyter_notebook_url,
362377
collection_id=self.collection_id,
363378
properties=exp_record_properties,
364-
links=links + theme_links,
379+
links=links + theme_links + dataset_link,
365380
)
366381
# Convert to dictionary and cleanup
367382
experiment_dict = experiment_record.to_dict()
@@ -371,6 +386,7 @@ def publish_workflow_experiment(self, write_to_file: bool = False):
371386
del experiment_dict["collection_id"]
372387
if "osc:project" in experiment_dict["properties"]:
373388
del experiment_dict["properties"]["osc:project"]
389+
# add experiment record to file_dict
374390
exp_file_path = f"experiments/{workflow_id}/record.json"
375391
file_dict[exp_file_path] = experiment_dict
376392

@@ -397,7 +413,9 @@ def publish_all(self, write_to_file: bool = False):
397413
"""Publish both dataset and workflow/experiment in a single PR."""
398414
# Get file dictionaries from both methods
399415
dataset_files = self.publish_dataset(write_to_file=write_to_file)
400-
workflow_files = self.publish_workflow_experiment(write_to_file=write_to_file)
416+
workflow_files = self.generate_workflow_experiment_records(
417+
write_to_file=write_to_file
418+
)
401419

402420
# Combine the file dictionaries
403421
combined_files = {**dataset_files, **workflow_files}

deep_code/utils/custom_xrlint_rules.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -71,7 +71,7 @@ def export_config() -> list:
7171
"content-desc": "off",
7272
"no-empty-attrs": "off",
7373
"conventions": "off",
74-
"time-coordinate": "off"
74+
"time-coordinate": "off",
7575
}
7676
},
7777
"deepcode/recommended",

deep_code/utils/dataset_stac_generator.py

Lines changed: 17 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -39,6 +39,9 @@ def __init__(
3939
self,
4040
dataset_id: str,
4141
collection_id: str,
42+
workflow_id: str,
43+
workflow_title: str,
44+
license_type: str,
4245
access_link: str | None = None,
4346
documentation_link: str | None = None,
4447
osc_status: str = "ongoing",
@@ -49,6 +52,9 @@ def __init__(
4952
):
5053
self.dataset_id = dataset_id
5154
self.collection_id = collection_id
55+
self.workflow_id = workflow_id
56+
self.workflow_title = workflow_title
57+
self.license_type = license_type
5258
self.access_link = access_link or f"s3://deep-esdl-public/{dataset_id}"
5359
self.documentation_link = documentation_link
5460
self.osc_status = osc_status
@@ -478,6 +484,17 @@ def build_dataset_stac_collection(self) -> Collection:
478484
)
479485
)
480486

487+
collection.add_link(
488+
Link(
489+
rel="related",
490+
target=f"../../experiments/{self.workflow_id}/record.json",
491+
media_type="application/json",
492+
title=f"Experiment: {self.workflow_title}",
493+
)
494+
)
495+
496+
collection.license = self.license_type
497+
481498
# Validate OSC extension fields
482499
try:
483500
osc_extension.validate_extension()

0 commit comments

Comments
 (0)