Skip to content

Commit c52432e

Browse files
authored
Refactor full glider metrics (#80)
* update pre-commits and skip new notebook
* output the correct geometry
* fix all lints
* add pyobis to the package
1 parent 03c3442 commit c52432e

File tree

4 files changed

+112
-72
lines changed

4 files changed

+112
-72
lines changed

.pre-commit-config.yaml

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
exclude: |
22
(?x)^(
3-
notebooks/GTS_Totals_weather_act.ipynb|
4-
notebooks/IOOS_BTN.ipynb|
53
btn_metrics.py|
64
gts_atn_metrics.py|
5+
notebooks/GTS_Totals_weather_act.ipynb|
6+
notebooks/IOOS_BTN.ipynb|
7+
notebooks/mbon_citation_visualizations.ipynb|
78
read_bufr.py|
89
website/.*
910
)$
@@ -19,7 +20,7 @@ repos:
1920
- id: check-added-large-files
2021

2122
- repo: https://github.com/codespell-project/codespell
22-
rev: v2.2.6
23+
rev: v2.3.0
2324
hooks:
2425
- id: codespell
2526
exclude: >
@@ -35,7 +36,7 @@ repos:
3536
- id: add-trailing-comma
3637

3738
- repo: https://github.com/astral-sh/ruff-pre-commit
38-
rev: v0.4.3
39+
rev: v0.4.6
3940
hooks:
4041
- id: ruff
4142
args: ["--fix", "--show-fixes"]

ioos_metrics/ioos_metrics.py

Lines changed: 56 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from bs4 import BeautifulSoup
1212
from fake_useragent import UserAgent
1313
from gliderpy.fetchers import GliderDataFetcher
14-
from shapely.geometry import LineString
14+
from shapely.geometry import LineString, Point
1515

1616
from ioos_metrics.national_platforms import national_platforms
1717

@@ -206,6 +206,18 @@ def _metadata(info_df) -> dict:
206206
),
207207
}
208208

209+
def _make_track_geom(df) -> "pd.DataFrame":
210+
geom = Point if df.shape[0] == 1 else LineString
211+
212+
return geom(
213+
(lon, lat)
214+
for (lon, lat) in zip(
215+
df["longitude (degrees_east)"],
216+
df["latitude (degrees_north)"],
217+
strict=False,
218+
)
219+
)
220+
209221
def _computed_metadata(dataset_id) -> dict:
210222
"""Download the minimum amount of data possible for the computed
211223
metadata.
@@ -220,16 +232,20 @@ def _computed_metadata(dataset_id) -> dict:
220232
"longitude",
221233
"time",
222234
]
223-
df = glider_grab.to_pandas()
235+
df = glider_grab.fetcher.to_pandas(distinct=True)
236+
df["time (UTC)"] = pd.to_datetime(df["time (UTC)"])
237+
df = df.set_index("time (UTC)")
224238
df = df.sort_index()
239+
track = _make_track_geom(df)
225240
days = df.index[-1].ceil("D") - df.index[0].floor("D")
226241
return {
227-
"deployment_lat": df["latitude"].iloc[0],
228-
"deployment_lon": df["longitude"].iloc[0],
242+
"deployment_lat": df["latitude (degrees_north)"].iloc[0],
243+
"deployment_lon": df["longitude (degrees_east)"].iloc[0],
229244
"num_profiles": len(df),
230245
# Profiles are not unique! Cannot use this!!
231246
# "num_profiles": len(set(df['profile_id']))
232247
"days": days,
248+
"track": track,
233249
}
234250

235251
glider_grab = GliderDataFetcher()
@@ -245,21 +261,8 @@ def _computed_metadata(dataset_id) -> dict:
245261
)
246262

247263
metadata = {}
248-
glider_grab.fetcher.variables = ["longitude", "latitude"]
249264
for _, row in list(df.iterrows()):
250265
dataset_id = row["Dataset ID"]
251-
252-
glider_grab.fetcher.dataset_id = dataset_id
253-
track = glider_grab.fetcher.to_pandas(distinct=True)
254-
track = LineString(
255-
(lon, lat)
256-
for (lon, lat) in zip(
257-
track["longitude (degrees_east)"],
258-
track["latitude (degrees_north)"],
259-
strict=False,
260-
)
261-
)
262-
263266
info_url = row["info_url"].replace("html", "csv")
264267
info_df = pd.read_csv(info_url)
265268
info = _metadata(info_df)
@@ -271,7 +274,6 @@ def _computed_metadata(dataset_id) -> dict:
271274
"This could be a server side error and the metrics will be incomplete!",
272275
)
273276
continue
274-
info.update({"track": track})
275277
metadata.update({dataset_id: info})
276278
return pd.DataFrame(metadata).T
277279

@@ -554,68 +556,71 @@ def hf_radar_installations():
554556
# This is a hardcoded number at the moment!
555557
return 165
556558

559+
557560
@functools.lru_cache(maxsize=128)
558561
def mbon_stats():
559-
"""
560-
This function collects download statistics about MBON affiliated datasets shared with the Ocean Biodiversity
562+
"""Collects download statistics about MBON affiliated datasets shared with the Ocean Biodiversity
561563
Information System (OBIS) and the Global Biodiversity Information Framework (GBIF). The function returns a
562564
dataframe with rows corresponding to each paper citing a dataset.
563565
"""
564-
import pyobis
565566
import urllib.parse
566567

568+
import pyobis
569+
567570
# collect dataset information from OBIS
568571
institution_id = 23070
569572
query = pyobis.dataset.search(instituteid=institution_id)
570573
df = pd.DataFrame(query.execute())
571574
df_obis = pd.DataFrame.from_records(df["results"])
572-
df_obis.columns = [f'obis_{col}' for col in df_obis.columns]
575+
df_obis.columns = [f"obis_{col}" for col in df_obis.columns]
573576

574577
df_mapping = pd.DataFrame()
575-
base_url = 'https://api.gbif.org'
578+
base_url = "https://api.gbif.org"
576579
# iterate through each OBIS dataset to gather uuid from GBIF
577580
# create a mapping table
578-
for title in df_obis['obis_title']:
581+
for title in df_obis["obis_title"]:
579582
string = title
580-
query = f'{base_url}/v1/dataset/search?q={urllib.parse.quote(string)}'
581-
df = pd.read_json(query, orient='index').T
583+
query = f"{base_url}/v1/dataset/search?q={urllib.parse.quote(string)}"
584+
df = pd.read_json(query, orient="index").T
582585

583586
# build a DataFrame with the info we need more accessible
584-
df_mapping = pd.concat([df_mapping, pd.DataFrame({
585-
'gbif_uuid': df['results'].values[0][0]['key'],
586-
'title': [df['results'].values[0][0]['title']],
587-
'obis_id': [df_obis.loc[df_obis['obis_title']==title,'obis_id'].to_string(index=False)],
588-
'doi': [df['results'].values[0][0]['doi']]
589-
})], ignore_index=True)
590-
587+
df_mapping = pd.concat(
588+
[
589+
df_mapping,
590+
pd.DataFrame(
591+
{
592+
"gbif_uuid": df["results"].to_numpy()[0][0]["key"],
593+
"title": [df["results"].to_numpy()[0][0]["title"]],
594+
"obis_id": [df_obis.loc[df_obis["obis_title"] == title, "obis_id"].to_string(index=False)],
595+
"doi": [df["results"].to_numpy()[0][0]["doi"]],
596+
},
597+
),
598+
],
599+
ignore_index=True,
600+
)
591601

592602
df_gbif = pd.DataFrame()
593-
for key in df_mapping['gbif_uuid']:
594-
595-
url = 'https://api.gbif.org/v1/literature/export?format=CSV&gbifDatasetKey={}'.format(key)
596-
df2 = pd.read_csv(url) # collect liturature cited information
597-
df2.columns = ['literature_' + str(col) for col in df2.columns]
598-
df2['gbif_uuid'] = key
603+
for key in df_mapping["gbif_uuid"]:
604+
url = f"https://api.gbif.org/v1/literature/export?format=CSV&gbifDatasetKey={key}"
605+
df2 = pd.read_csv(url) # collect literature cited information
606+
df2.columns = ["literature_" + str(col) for col in df2.columns]
607+
df2["gbif_uuid"] = key
599608

600-
df_gbif = pd.concat([df2,df_gbif], ignore_index=True)
609+
df_gbif = pd.concat([df2, df_gbif], ignore_index=True)
601610

602611
# merge the OBIS and GBIF data frames together
603-
df_obis = df_obis.merge(df_mapping, on='obis_id')
612+
df_obis = df_obis.merge(df_mapping, on="obis_id")
604613

605614
# add gbif download stats
606615

607-
for key in df_obis['gbif_uuid']:
608-
url = f'https://api.gbif.org/v1/occurrence/download/statistics/export?datasetKey={key}'
609-
df2 = pd.read_csv(url,sep='\t')
610-
df2_group = df2.groupby('year').agg({'number_downloads':'sum'})
611-
612-
df_obis.loc[df_obis['gbif_uuid']==key,'gbif_downloads'] = str(df2_group.to_dict())
613-
614-
df_out = df_gbif.merge(df_obis, on='gbif_uuid')
615-
616-
return df_out
616+
for key in df_obis["gbif_uuid"]:
617+
url = f"https://api.gbif.org/v1/occurrence/download/statistics/export?datasetKey={key}"
618+
df2 = pd.read_csv(url, sep="\t")
619+
df2_group = df2.groupby("year").agg({"number_downloads": "sum"})
617620

621+
df_obis.loc[df_obis["gbif_uuid"] == key, "gbif_downloads"] = str(df2_group.to_dict())
618622

623+
return df_gbif.merge(df_obis, on="gbif_uuid")
619624

620625

621626
def update_metrics(*, debug=False):

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ dependencies = [
3636
"pandas",
3737
"pdfminer.six",
3838
"pyarrow",
39+
"pyobis",
3940
"requests",
4041
]
4142
[project.urls]

tests/test_metrics.py

Lines changed: 50 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -74,24 +74,57 @@ def test_update_metrics():
7474
df = update_metrics(debug=True)
7575
df.to_csv("updated_metrics.csv")
7676

77+
7778
def test_mbon_stats():
7879
df = ioos_metrics.mbon_stats()
79-
columns = ['literature_title', 'literature_authors', 'literature_source',
80-
'literature_discovered', 'literature_published',
81-
'literature_open_access', 'literature_peer_review',
82-
'literature_citation_type', 'literature_countries_of_coverage',
83-
'literature_countries_of_researcher', 'literature_keywords',
84-
'literature_literature_type', 'literature_websites',
85-
'literature_identifiers', 'literature_id', 'literature_abstract',
86-
'literature_topics', 'literature_added', 'literature_gbif_download_key',
87-
'gbif_uuid', 'obis_id', 'obis_url', 'obis_archive', 'obis_published',
88-
'obis_created', 'obis_updated', 'obis_core', 'obis_extensions',
89-
'obis_statistics', 'obis_extent', 'obis_title', 'obis_citation',
90-
'obis_citation_id', 'obis_abstract', 'obis_intellectualrights',
91-
'obis_feed', 'obis_institutes', 'obis_contacts', 'obis_nodes',
92-
'obis_keywords', 'obis_downloads', 'obis_records', 'title', 'doi',
93-
'gbif_downloads']
80+
columns = [
81+
"literature_title",
82+
"literature_authors",
83+
"literature_source",
84+
"literature_discovered",
85+
"literature_published",
86+
"literature_open_access",
87+
"literature_peer_review",
88+
"literature_citation_type",
89+
"literature_countries_of_coverage",
90+
"literature_countries_of_researcher",
91+
"literature_keywords",
92+
"literature_literature_type",
93+
"literature_websites",
94+
"literature_identifiers",
95+
"literature_id",
96+
"literature_abstract",
97+
"literature_topics",
98+
"literature_added",
99+
"literature_gbif_download_key",
100+
"gbif_uuid",
101+
"obis_id",
102+
"obis_url",
103+
"obis_archive",
104+
"obis_published",
105+
"obis_created",
106+
"obis_updated",
107+
"obis_core",
108+
"obis_extensions",
109+
"obis_statistics",
110+
"obis_extent",
111+
"obis_title",
112+
"obis_citation",
113+
"obis_citation_id",
114+
"obis_abstract",
115+
"obis_intellectualrights",
116+
"obis_feed",
117+
"obis_institutes",
118+
"obis_contacts",
119+
"obis_nodes",
120+
"obis_keywords",
121+
"obis_downloads",
122+
"obis_records",
123+
"title",
124+
"doi",
125+
"gbif_downloads",
126+
]
94127

95128
assert isinstance(df, pd.DataFrame)
96-
assert all([col in df.columns for col in columns])
97-
assert not df.empty
129+
assert all(col in df.columns for col in columns)
130+
assert not df.empty

0 commit comments

Comments
 (0)