Skip to content

Commit c52432e

Browse files
authored
Refactor full glider metrics (#80)
* update pre-commits and skip new notebook
* output the correct geometry
* fix all lints
* add pyobis to the package
1 parent 03c3442 commit c52432e

File tree

4 files changed

+112
-72
lines changed

4 files changed

+112
-72
lines changed

.pre-commit-config.yaml

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
exclude: |
22
(?x)^(
3-
notebooks/GTS_Totals_weather_act.ipynb|
4-
notebooks/IOOS_BTN.ipynb|
53
btn_metrics.py|
64
gts_atn_metrics.py|
5+
notebooks/GTS_Totals_weather_act.ipynb|
6+
notebooks/IOOS_BTN.ipynb|
7+
notebooks/mbon_citation_visualizations.ipynb|
78
read_bufr.py|
89
website/.*
910
)$
@@ -19,7 +20,7 @@ repos:
1920
- id: check-added-large-files
2021

2122
- repo: https://github.com/codespell-project/codespell
22-
rev: v2.2.6
23+
rev: v2.3.0
2324
hooks:
2425
- id: codespell
2526
exclude: >
@@ -35,7 +36,7 @@ repos:
3536
- id: add-trailing-comma
3637

3738
- repo: https://github.com/astral-sh/ruff-pre-commit
38-
rev: v0.4.3
39+
rev: v0.4.6
3940
hooks:
4041
- id: ruff
4142
args: ["--fix", "--show-fixes"]

ioos_metrics/ioos_metrics.py

Lines changed: 56 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from bs4 import BeautifulSoup
1212
from fake_useragent import UserAgent
1313
from gliderpy.fetchers import GliderDataFetcher
14-
from shapely.geometry import LineString
14+
from shapely.geometry import LineString, Point
1515

1616
from ioos_metrics.national_platforms import national_platforms
1717

@@ -206,6 +206,18 @@ def _metadata(info_df) -> dict:
206206
),
207207
}
208208

209+
def _make_track_geom(df) -> "pd.DataFrame":
210+
geom = Point if df.shape[0] == 1 else LineString
211+
212+
return geom(
213+
(lon, lat)
214+
for (lon, lat) in zip(
215+
df["longitude (degrees_east)"],
216+
df["latitude (degrees_north)"],
217+
strict=False,
218+
)
219+
)
220+
209221
def _computed_metadata(dataset_id) -> dict:
210222
"""Download the minimum amount of data possible for the computed
211223
metadata.
@@ -220,16 +232,20 @@ def _computed_metadata(dataset_id) -> dict:
220232
"longitude",
221233
"time",
222234
]
223-
df = glider_grab.to_pandas()
235+
df = glider_grab.fetcher.to_pandas(distinct=True)
236+
df["time (UTC)"] = pd.to_datetime(df["time (UTC)"])
237+
df = df.set_index("time (UTC)")
224238
df = df.sort_index()
239+
track = _make_track_geom(df)
225240
days = df.index[-1].ceil("D") - df.index[0].floor("D")
226241
return {
227-
"deployment_lat": df["latitude"].iloc[0],
228-
"deployment_lon": df["longitude"].iloc[0],
242+
"deployment_lat": df["latitude (degrees_north)"].iloc[0],
243+
"deployment_lon": df["longitude (degrees_east)"].iloc[0],
229244
"num_profiles": len(df),
230245
# Profiles are not unique! Cannot use this!!
231246
# "num_profiles": len(set(df['profile_id']))
232247
"days": days,
248+
"track": track,
233249
}
234250

235251
glider_grab = GliderDataFetcher()
@@ -245,21 +261,8 @@ def _computed_metadata(dataset_id) -> dict:
245261
)
246262

247263
metadata = {}
248-
glider_grab.fetcher.variables = ["longitude", "latitude"]
249264
for _, row in list(df.iterrows()):
250265
dataset_id = row["Dataset ID"]
251-
252-
glider_grab.fetcher.dataset_id = dataset_id
253-
track = glider_grab.fetcher.to_pandas(distinct=True)
254-
track = LineString(
255-
(lon, lat)
256-
for (lon, lat) in zip(
257-
track["longitude (degrees_east)"],
258-
track["latitude (degrees_north)"],
259-
strict=False,
260-
)
261-
)
262-
263266
info_url = row["info_url"].replace("html", "csv")
264267
info_df = pd.read_csv(info_url)
265268
info = _metadata(info_df)
@@ -271,7 +274,6 @@ def _computed_metadata(dataset_id) -> dict:
271274
"This could be a server side error and the metrics will be incomplete!",
272275
)
273276
continue
274-
info.update({"track": track})
275277
metadata.update({dataset_id: info})
276278
return pd.DataFrame(metadata).T
277279

@@ -554,68 +556,71 @@ def hf_radar_installations():
554556
# This is a hardcoded number at the moment!
555557
return 165
556558

559+
557560
@functools.lru_cache(maxsize=128)
558561
def mbon_stats():
559-
"""
560-
This function collects download statistics about MBON affiliated datasets shared with the Ocean Biodiversity
562+
"""Collects download statistics about MBON affiliated datasets shared with the Ocean Biodiversity
561563
Information System (OBIS) and the Global Biodiversity Information Framework (GBIF). The function returns a
562564
dataframe with rows corresponding to each paper citing a dataset.
563565
"""
564-
import pyobis
565566
import urllib.parse
566567

568+
import pyobis
569+
567570
# collect dataset information from OBIS
568571
institution_id = 23070
569572
query = pyobis.dataset.search(instituteid=institution_id)
570573
df = pd.DataFrame(query.execute())
571574
df_obis = pd.DataFrame.from_records(df["results"])
572-
df_obis.columns = [f'obis_{col}' for col in df_obis.columns]
575+
df_obis.columns = [f"obis_{col}" for col in df_obis.columns]
573576

574577
df_mapping = pd.DataFrame()
575-
base_url = 'https://api.gbif.org'
578+
base_url = "https://api.gbif.org"
576579
# iterate through each OBIS dataset to gather uuid from GBIF
577580
# create a mapping table
578-
for title in df_obis['obis_title']:
581+
for title in df_obis["obis_title"]:
579582
string = title
580-
query = f'{base_url}/v1/dataset/search?q={urllib.parse.quote(string)}'
581-
df = pd.read_json(query, orient='index').T
583+
query = f"{base_url}/v1/dataset/search?q={urllib.parse.quote(string)}"
584+
df = pd.read_json(query, orient="index").T
582585

583586
# build a DataFrame with the info we need more accessible
584-
df_mapping = pd.concat([df_mapping, pd.DataFrame({
585-
'gbif_uuid': df['results'].values[0][0]['key'],
586-
'title': [df['results'].values[0][0]['title']],
587-
'obis_id': [df_obis.loc[df_obis['obis_title']==title,'obis_id'].to_string(index=False)],
588-
'doi': [df['results'].values[0][0]['doi']]
589-
})], ignore_index=True)
590-
587+
df_mapping = pd.concat(
588+
[
589+
df_mapping,
590+
pd.DataFrame(
591+
{
592+
"gbif_uuid": df["results"].to_numpy()[0][0]["key"],
593+
"title": [df["results"].to_numpy()[0][0]["title"]],
594+
"obis_id": [df_obis.loc[df_obis["obis_title"] == title, "obis_id"].to_string(index=False)],
595+
"doi": [df["results"].to_numpy()[0][0]["doi"]],
596+
},
597+
),
598+
],
599+
ignore_index=True,
600+
)
591601

592602
df_gbif = pd.DataFrame()
593-
for key in df_mapping['gbif_uuid']:
594-
595-
url = 'https://api.gbif.org/v1/literature/export?format=CSV&gbifDatasetKey={}'.format(key)
596-
df2 = pd.read_csv(url) # collect liturature cited information
597-
df2.columns = ['literature_' + str(col) for col in df2.columns]
598-
df2['gbif_uuid'] = key
603+
for key in df_mapping["gbif_uuid"]:
604+
url = f"https://api.gbif.org/v1/literature/export?format=CSV&gbifDatasetKey={key}"
605+
df2 = pd.read_csv(url) # collect literature cited information
606+
df2.columns = ["literature_" + str(col) for col in df2.columns]
607+
df2["gbif_uuid"] = key
599608

600-
df_gbif = pd.concat([df2,df_gbif], ignore_index=True)
609+
df_gbif = pd.concat([df2, df_gbif], ignore_index=True)
601610

602611
# merge the OBIS and GBIF data frames together
603-
df_obis = df_obis.merge(df_mapping, on='obis_id')
612+
df_obis = df_obis.merge(df_mapping, on="obis_id")
604613

605614
# add gbif download stats
606615

607-
for key in df_obis['gbif_uuid']:
608-
url = f'https://api.gbif.org/v1/occurrence/download/statistics/export?datasetKey={key}'
609-
df2 = pd.read_csv(url,sep='\t')
610-
df2_group = df2.groupby('year').agg({'number_downloads':'sum'})
611-
612-
df_obis.loc[df_obis['gbif_uuid']==key,'gbif_downloads'] = str(df2_group.to_dict())
613-
614-
df_out = df_gbif.merge(df_obis, on='gbif_uuid')
615-
616-
return df_out
616+
for key in df_obis["gbif_uuid"]:
617+
url = f"https://api.gbif.org/v1/occurrence/download/statistics/export?datasetKey={key}"
618+
df2 = pd.read_csv(url, sep="\t")
619+
df2_group = df2.groupby("year").agg({"number_downloads": "sum"})
617620

621+
df_obis.loc[df_obis["gbif_uuid"] == key, "gbif_downloads"] = str(df2_group.to_dict())
618622

623+
return df_gbif.merge(df_obis, on="gbif_uuid")
619624

620625

621626
def update_metrics(*, debug=False):

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ dependencies = [
3636
"pandas",
3737
"pdfminer.six",
3838
"pyarrow",
39+
"pyobis",
3940
"requests",
4041
]
4142
[project.urls]

tests/test_metrics.py

Lines changed: 50 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -74,24 +74,57 @@ def test_update_metrics():
7474
df = update_metrics(debug=True)
7575
df.to_csv("updated_metrics.csv")
7676

77+
7778
def test_mbon_stats():
7879
df = ioos_metrics.mbon_stats()
79-
columns = ['literature_title', 'literature_authors', 'literature_source',
80-
'literature_discovered', 'literature_published',
81-
'literature_open_access', 'literature_peer_review',
82-
'literature_citation_type', 'literature_countries_of_coverage',
83-
'literature_countries_of_researcher', 'literature_keywords',
84-
'literature_literature_type', 'literature_websites',
85-
'literature_identifiers', 'literature_id', 'literature_abstract',
86-
'literature_topics', 'literature_added', 'literature_gbif_download_key',
87-
'gbif_uuid', 'obis_id', 'obis_url', 'obis_archive', 'obis_published',
88-
'obis_created', 'obis_updated', 'obis_core', 'obis_extensions',
89-
'obis_statistics', 'obis_extent', 'obis_title', 'obis_citation',
90-
'obis_citation_id', 'obis_abstract', 'obis_intellectualrights',
91-
'obis_feed', 'obis_institutes', 'obis_contacts', 'obis_nodes',
92-
'obis_keywords', 'obis_downloads', 'obis_records', 'title', 'doi',
93-
'gbif_downloads']
80+
columns = [
81+
"literature_title",
82+
"literature_authors",
83+
"literature_source",
84+
"literature_discovered",
85+
"literature_published",
86+
"literature_open_access",
87+
"literature_peer_review",
88+
"literature_citation_type",
89+
"literature_countries_of_coverage",
90+
"literature_countries_of_researcher",
91+
"literature_keywords",
92+
"literature_literature_type",
93+
"literature_websites",
94+
"literature_identifiers",
95+
"literature_id",
96+
"literature_abstract",
97+
"literature_topics",
98+
"literature_added",
99+
"literature_gbif_download_key",
100+
"gbif_uuid",
101+
"obis_id",
102+
"obis_url",
103+
"obis_archive",
104+
"obis_published",
105+
"obis_created",
106+
"obis_updated",
107+
"obis_core",
108+
"obis_extensions",
109+
"obis_statistics",
110+
"obis_extent",
111+
"obis_title",
112+
"obis_citation",
113+
"obis_citation_id",
114+
"obis_abstract",
115+
"obis_intellectualrights",
116+
"obis_feed",
117+
"obis_institutes",
118+
"obis_contacts",
119+
"obis_nodes",
120+
"obis_keywords",
121+
"obis_downloads",
122+
"obis_records",
123+
"title",
124+
"doi",
125+
"gbif_downloads",
126+
]
94127

95128
assert isinstance(df, pd.DataFrame)
96-
assert all([col in df.columns for col in columns])
97-
assert not df.empty
129+
assert all(col in df.columns for col in columns)
130+
assert not df.empty

0 commit comments

Comments
 (0)