Skip to content

Commit 1e27e28

Browse files
committed
improve format and dataset name handling
1 parent 403c71d commit 1e27e28

File tree

2 files changed

+62
-11
lines changed

2 files changed

+62
-11
lines changed

scripts/ej/cmr_processing.py

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ class DownloadInfo(NamedTuple):
2727
has_distribution: bool
2828
has_direct_download: bool
2929
visualization_urls: list[str]
30-
format: str
30+
formats: list[str] # Changed from single format to list of formats
3131

3232

3333
class ProcessingInfo(NamedTuple):
@@ -190,7 +190,17 @@ def _process_download_info(self) -> DownloadInfo:
190190
has_distribution = False
191191
has_direct_download = False
192192
visualization_urls = []
193+
formats = []
193194

195+
# Extract formats from FileDistributionInformation
196+
archive_info = self.umm.get("ArchiveAndDistributionInformation", {})
197+
distribution_info = archive_info.get("FileDistributionInformation", [])
198+
199+
for info in distribution_info:
200+
if "Format" in info:
201+
formats.append(info["Format"])
202+
203+
# Process RelatedUrls
194204
related_urls = self.umm.get("RelatedUrls", [])
195205
for url in related_urls:
196206
if url.get("URLContentType") == "DistributionURL" and url.get("Type") == "GET DATA":
@@ -204,9 +214,14 @@ def _process_download_info(self) -> DownloadInfo:
204214
has_distribution=has_distribution,
205215
has_direct_download=has_direct_download,
206216
visualization_urls=visualization_urls,
207-
format=self.meta.get("format", ""),
217+
formats=formats,
208218
)
209219

220+
@property
221+
def format(self) -> str:
222+
"""Get dataset formats as a semicolon-separated string."""
223+
return "; ".join(self.download_info.formats) if self.download_info.formats else ""
224+
210225
def _process_processing_info(self) -> ProcessingInfo:
211226
"""Process all processing level information."""
212227
processing_level = self.umm.get("ProcessingLevel", {}).get("Id", "")
@@ -331,8 +346,8 @@ def projects(self) -> str:
331346

332347
@property
333348
def dataset_name(self) -> str:
334-
"""Get dataset short name."""
335-
return self.umm.get("ShortName", "")
349+
"""Get dataset entry title, falling back to the short name."""
350+
return self.umm.get("EntryTitle", self.umm.get("ShortName", ""))
336351

337352
@property
338353
def description(self) -> str:
@@ -344,11 +359,6 @@ def limitations(self) -> str:
344359
"""Get dataset access constraints."""
345360
return self.umm.get("AccessConstraints", {}).get("Description", "")
346361

347-
@property
348-
def format(self) -> str:
349-
"""Get dataset format."""
350-
return self.download_info.format
351-
352362
@property
353363
def temporal_extent(self) -> str:
354364
"""Get dataset temporal extent."""

scripts/ej/test_cmr_processing.py

Lines changed: 43 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,10 @@ def cmr_dataset(self):
2121
def test_full_dataset_processing(self, cmr_dataset):
2222
"""Test that all properties can be extracted from real data without errors"""
2323
# Test all property accessors
24-
assert cmr_dataset.dataset_name == "CIESIN_SEDAC_ESI_2000"
24+
assert cmr_dataset.dataset_name == "2000 Pilot Environmental Sustainability Index (ESI)"
2525
assert cmr_dataset.description.startswith("The 2000 Pilot Environmental Sustainability Index")
2626
assert cmr_dataset.limitations == "None"
27-
assert cmr_dataset.format == "application/vnd.nasa.cmr.umm+json"
27+
assert cmr_dataset.format == "PDF"
2828
assert cmr_dataset.temporal_extent == "" # No SingleDateTimes in example
2929
assert cmr_dataset.intended_use == "Path A" # ProcessingLevel is 4
3030
assert cmr_dataset.source_link == "https://doi.org/10.7927/H4NK3BZJ"
@@ -337,6 +337,47 @@ def test_visualization_urls(self):
337337
assert "http://example.com/viz1" in dataset.data_visualization
338338
assert "http://example.com/viz2" in dataset.data_visualization
339339

340+
def test_format_extraction_single(self):
341+
data = {
342+
"umm": {
343+
"ArchiveAndDistributionInformation": {
344+
"FileDistributionInformation": [{"Format": "GeoTIFF", "Fees": "0"}]
345+
}
346+
}
347+
}
348+
dataset = CmrDataset(data)
349+
assert dataset.format == "GeoTIFF"
350+
351+
def test_format_extraction_multiple(self):
352+
data = {
353+
"umm": {
354+
"ArchiveAndDistributionInformation": {
355+
"FileDistributionInformation": [
356+
{"Format": "Excel", "Fees": "0"},
357+
{"Format": "PDF", "Fees": "0"},
358+
{"Format": "PNG", "Fees": "0"},
359+
]
360+
}
361+
}
362+
}
363+
dataset = CmrDataset(data)
364+
assert dataset.format == "Excel; PDF; PNG"
365+
366+
def test_format_extraction_empty(self):
367+
data = {"umm": {"ArchiveAndDistributionInformation": {"FileDistributionInformation": []}}}
368+
dataset = CmrDataset(data)
369+
assert dataset.format == ""
370+
371+
def test_format_extraction_missing_info(self):
372+
data = {"umm": {"ArchiveAndDistributionInformation": {}}}
373+
dataset = CmrDataset(data)
374+
assert dataset.format == ""
375+
376+
def test_format_extraction_no_archive_info(self):
377+
data = {"umm": {}}
378+
dataset = CmrDataset(data)
379+
assert dataset.format == ""
380+
340381

341382
class TestProcessingLevelInfo:
342383
"""Unit tests for processing level information"""

0 commit comments

Comments
 (0)