improve format and dataset name handling

CarsonDavis · CarsonDavis · commit 1e27e284df1d · 2024-11-20T23:16:34.000-06:00
diff --git a/scripts/ej/cmr_processing.py b/scripts/ej/cmr_processing.py
@@ -27,7 +27,7 @@ class DownloadInfo(NamedTuple):
     has_distribution: bool
     has_direct_download: bool
     visualization_urls: list[str]
-    format: str
+    formats: list[str]  # "Format" values collected from FileDistributionInformation entries
 
 
 class ProcessingInfo(NamedTuple):
@@ -190,7 +190,17 @@ def _process_download_info(self) -> DownloadInfo:
         has_distribution = False
         has_direct_download = False
         visualization_urls = []
+        formats = []
 
+        # Extract formats from FileDistributionInformation
+        archive_info = self.umm.get("ArchiveAndDistributionInformation", {})
+        distribution_info = archive_info.get("FileDistributionInformation", [])
+
+        for info in distribution_info:
+            if "Format" in info:
+                formats.append(info["Format"])
+
+        # Process RelatedUrls
         related_urls = self.umm.get("RelatedUrls", [])
         for url in related_urls:
             if url.get("URLContentType") == "DistributionURL" and url.get("Type") == "GET DATA":
@@ -204,9 +214,14 @@ def _process_download_info(self) -> DownloadInfo:
             has_distribution=has_distribution,
             has_direct_download=has_direct_download,
             visualization_urls=visualization_urls,
-            format=self.meta.get("format", ""),
+            formats=formats,
         )
 
+    @property
+    def format(self) -> str:
+        """Get dataset formats as one "; "-separated string (empty string when there are none)."""
+        return "; ".join(self.download_info.formats) if self.download_info.formats else ""
+
     def _process_processing_info(self) -> ProcessingInfo:
         """Process all processing level information."""
         processing_level = self.umm.get("ProcessingLevel", {}).get("Id", "")
@@ -331,8 +346,8 @@ def projects(self) -> str:
 
     @property
     def dataset_name(self) -> str:
-        """Get dataset short name."""
-        return self.umm.get("ShortName", "")
+        """Get dataset entry title, falling back to the short name when "EntryTitle" is absent."""
+        return self.umm.get("EntryTitle", self.umm.get("ShortName", ""))
 
     @property
     def description(self) -> str:
@@ -344,11 +359,6 @@ def limitations(self) -> str:
         """Get dataset access constraints."""
         return self.umm.get("AccessConstraints", {}).get("Description", "")
 
-    @property
-    def format(self) -> str:
-        """Get dataset format."""
-        return self.download_info.format
-
     @property
     def temporal_extent(self) -> str:
         """Get dataset temporal extent."""
diff --git a/scripts/ej/test_cmr_processing.py b/scripts/ej/test_cmr_processing.py
@@ -21,10 +21,10 @@ def cmr_dataset(self):
     def test_full_dataset_processing(self, cmr_dataset):
         """Test that all properties can be extracted from real data without errors"""
         # Test all property accessors
-        assert cmr_dataset.dataset_name == "CIESIN_SEDAC_ESI_2000"
+        assert cmr_dataset.dataset_name == "2000 Pilot Environmental Sustainability Index (ESI)"
         assert cmr_dataset.description.startswith("The 2000 Pilot Environmental Sustainability Index")
         assert cmr_dataset.limitations == "None"
-        assert cmr_dataset.format == "application/vnd.nasa.cmr.umm+json"
+        assert cmr_dataset.format == "PDF"
         assert cmr_dataset.temporal_extent == ""  # No SingleDateTimes in example
         assert cmr_dataset.intended_use == "Path A"  # ProcessingLevel is 4
         assert cmr_dataset.source_link == "https://doi.org/10.7927/H4NK3BZJ"
@@ -337,6 +337,47 @@ def test_visualization_urls(self):
         assert "http://example.com/viz1" in dataset.data_visualization
         assert "http://example.com/viz2" in dataset.data_visualization
 
+    def test_format_extraction_single(self):
+        data = {
+            "umm": {
+                "ArchiveAndDistributionInformation": {
+                    "FileDistributionInformation": [{"Format": "GeoTIFF", "Fees": "0"}]
+                }
+            }
+        }
+        dataset = CmrDataset(data)
+        assert dataset.format == "GeoTIFF"  # a lone format comes back as-is, with no separator
+
+    def test_format_extraction_multiple(self):
+        data = {
+            "umm": {
+                "ArchiveAndDistributionInformation": {
+                    "FileDistributionInformation": [
+                        {"Format": "Excel", "Fees": "0"},
+                        {"Format": "PDF", "Fees": "0"},
+                        {"Format": "PNG", "Fees": "0"},
+                    ]
+                }
+            }
+        }
+        dataset = CmrDataset(data)
+        assert dataset.format == "Excel; PDF; PNG"  # joined with "; " in input order
+
+    def test_format_extraction_empty(self):
+        data = {"umm": {"ArchiveAndDistributionInformation": {"FileDistributionInformation": []}}}
+        dataset = CmrDataset(data)
+        assert dataset.format == ""  # an empty distribution list yields an empty string
+
+    def test_format_extraction_missing_info(self):
+        data = {"umm": {"ArchiveAndDistributionInformation": {}}}  # no FileDistributionInformation key
+        dataset = CmrDataset(data)
+        assert dataset.format == ""
+
+    def test_format_extraction_no_archive_info(self):
+        data = {"umm": {}}  # no ArchiveAndDistributionInformation key at all
+        dataset = CmrDataset(data)
+        assert dataset.format == ""
+
 
 class TestProcessingLevelInfo:
     """Unit tests for processing level information"""