1111from bs4 import BeautifulSoup
1212from fake_useragent import UserAgent
1313from gliderpy .fetchers import GliderDataFetcher
14- from shapely .geometry import LineString
14+ from shapely .geometry import LineString , Point
1515
1616from ioos_metrics .national_platforms import national_platforms
1717
@@ -206,6 +206,18 @@ def _metadata(info_df) -> dict:
206206 ),
207207 }
208208
def _make_track_geom(df) -> "pd.DataFrame":
    """Return the deployment track of *df* as a shapely geometry.

    A single-position deployment yields a ``Point`` (a ``LineString``
    needs at least two vertices); anything longer yields a
    ``LineString`` tracing the glider positions in row order.

    Expects the ERDDAP-style columns ``longitude (degrees_east)`` and
    ``latitude (degrees_north)``.
    """
    pairs = (
        (lon, lat)
        for (lon, lat) in zip(
            df["longitude (degrees_east)"],
            df["latitude (degrees_north)"],
            strict=False,
        )
    )
    if df.shape[0] == 1:
        return Point(pairs)
    return LineString(pairs)
220+
209221 def _computed_metadata (dataset_id ) -> dict :
210222 """Download the minimum amount of data possible for the computed
211223 metadata.
@@ -220,16 +232,20 @@ def _computed_metadata(dataset_id) -> dict:
220232 "longitude" ,
221233 "time" ,
222234 ]
223- df = glider_grab .to_pandas ()
235+ df = glider_grab .fetcher .to_pandas (distinct = True )
236+ df ["time (UTC)" ] = pd .to_datetime (df ["time (UTC)" ])
237+ df = df .set_index ("time (UTC)" )
224238 df = df .sort_index ()
239+ track = _make_track_geom (df )
225240 days = df .index [- 1 ].ceil ("D" ) - df .index [0 ].floor ("D" )
226241 return {
227- "deployment_lat" : df ["latitude" ].iloc [0 ],
228- "deployment_lon" : df ["longitude" ].iloc [0 ],
242+ "deployment_lat" : df ["latitude (degrees_north) " ].iloc [0 ],
243+ "deployment_lon" : df ["longitude (degrees_east) " ].iloc [0 ],
229244 "num_profiles" : len (df ),
230245 # Profiles are not unique! Cannot use this!!
231246 # "num_profiles": len(set(df['profile_id']))
232247 "days" : days ,
248+ "track" : track ,
233249 }
234250
235251 glider_grab = GliderDataFetcher ()
@@ -245,21 +261,8 @@ def _computed_metadata(dataset_id) -> dict:
245261 )
246262
247263 metadata = {}
248- glider_grab .fetcher .variables = ["longitude" , "latitude" ]
249264 for _ , row in list (df .iterrows ()):
250265 dataset_id = row ["Dataset ID" ]
251-
252- glider_grab .fetcher .dataset_id = dataset_id
253- track = glider_grab .fetcher .to_pandas (distinct = True )
254- track = LineString (
255- (lon , lat )
256- for (lon , lat ) in zip (
257- track ["longitude (degrees_east)" ],
258- track ["latitude (degrees_north)" ],
259- strict = False ,
260- )
261- )
262-
263266 info_url = row ["info_url" ].replace ("html" , "csv" )
264267 info_df = pd .read_csv (info_url )
265268 info = _metadata (info_df )
@@ -271,7 +274,6 @@ def _computed_metadata(dataset_id) -> dict:
271274 "This could be a server side error and the metrics will be incomplete!" ,
272275 )
273276 continue
274- info .update ({"track" : track })
275277 metadata .update ({dataset_id : info })
276278 return pd .DataFrame (metadata ).T
277279
@@ -554,68 +556,71 @@ def hf_radar_installations():
554556 # This is a hardcoded number at the moment!
555557 return 165
556558
559+
@functools.lru_cache(maxsize=128)
def mbon_stats():
    """Collect citation and download statistics about MBON affiliated datasets.

    Queries the Ocean Biodiversity Information System (OBIS) for datasets
    published under the MBON institution id, maps each dataset title to its
    GBIF uuid, then attaches GBIF literature citations and per-year download
    statistics.

    Returns a DataFrame with one row per paper citing a dataset.
    """
    import urllib.parse

    import pyobis

    # Collect dataset information from OBIS.
    institution_id = 23070
    query = pyobis.dataset.search(instituteid=institution_id)
    df = pd.DataFrame(query.execute())
    df_obis = pd.DataFrame.from_records(df["results"])
    df_obis.columns = [f"obis_{col}" for col in df_obis.columns]

    base_url = "https://api.gbif.org"
    # Iterate through each OBIS dataset to gather its uuid from GBIF and
    # build a mapping table. Rows are accumulated in a list and concatenated
    # once at the end instead of calling pd.concat on every iteration.
    mapping_rows = []
    for title in df_obis["obis_title"]:
        search_url = f"{base_url}/v1/dataset/search?q={urllib.parse.quote(title)}"
        result = pd.read_json(search_url, orient="index").T
        # First search hit is taken as the GBIF match for this title.
        top_hit = result["results"].to_numpy()[0][0]
        mapping_rows.append(
            pd.DataFrame(
                {
                    "gbif_uuid": top_hit["key"],
                    "title": [top_hit["title"]],
                    "obis_id": [df_obis.loc[df_obis["obis_title"] == title, "obis_id"].to_string(index=False)],
                    "doi": [top_hit["doi"]],
                },
            ),
        )
    df_mapping = pd.concat(mapping_rows, ignore_index=True) if mapping_rows else pd.DataFrame()

    # Collect the literature that cites each GBIF dataset.
    df_gbif = pd.DataFrame()
    for key in df_mapping["gbif_uuid"]:
        url = f"{base_url}/v1/literature/export?format=CSV&gbifDatasetKey={key}"
        df2 = pd.read_csv(url)  # collect literature cited information
        df2.columns = ["literature_" + str(col) for col in df2.columns]
        df2["gbif_uuid"] = key

        df_gbif = pd.concat([df2, df_gbif], ignore_index=True)

    # Merge the OBIS and GBIF data frames together.
    df_obis = df_obis.merge(df_mapping, on="obis_id")

    # Add GBIF download stats, aggregated per year and stored as a
    # stringified dict per dataset.
    for key in df_obis["gbif_uuid"]:
        url = f"{base_url}/v1/occurrence/download/statistics/export?datasetKey={key}"
        df2 = pd.read_csv(url, sep="\t")
        df2_group = df2.groupby("year").agg({"number_downloads": "sum"})

        df_obis.loc[df_obis["gbif_uuid"] == key, "gbif_downloads"] = str(df2_group.to_dict())

    return df_gbif.merge(df_obis, on="gbif_uuid")
619624
620625
621626def update_metrics (* , debug = False ):
0 commit comments