1414def generate_iucn_to_gbif_map (
1515 collated_data_path : Path ,
1616 output_dir_path : Path ,
17+ taxa : str ,
1718) -> pd .DataFrame :
1819 collated_data = pd .read_csv (collated_data_path )
1920
2021 # To save spamming the GBIF API, see if there's already a map
2122 # and if so we just request GBIF IDs for data we've not seen before
2223 map_filename = output_dir_path / "map.csv"
23- id_map : Dict [int ,Tuple [str ,str ,int ,Optional [int ]]] = {}
24+ id_map : Dict [int ,Tuple [str ,str ,int ,Optional [int ], str ]] = {}
2425 try :
2526 existing_map = pd .read_csv (map_filename )
2627 for _ , row in existing_map .iterrows ():
27- id_map [row .iucn_taxon_id ] = (row .iucn_taxon_id , row .scientific_name , row .assessment_year , row .gbif_id )
28+ id_map [row .iucn_taxon_id ] = (
29+ row .iucn_taxon_id ,
30+ row .scientific_name ,
31+ row .assessment_year ,
32+ row .gbif_id ,
33+ row .class_name
34+ )
2835 except (AttributeError , FileNotFoundError ):
2936 pass
3037
3138 # First we make a map
32- for _ , row in collated_data .iterrows ():
39+ for _ , row in collated_data [ collated_data . class_name == taxa ] .iterrows ():
3340 taxon_id = row .id_no
3441 if taxon_id in id_map :
42+ print (f"skipping { taxon_id } " )
3543 continue
3644 assessment_year = row .assessment_year
3745 scientific_name = row .scientific_name
46+ class_name = row .class_name
3847
3948 if not assessment_year :
4049 continue
@@ -47,27 +56,28 @@ def generate_iucn_to_gbif_map(
4756 raise ValueError ("no match found" )
4857 gbif_id = result ["usageKey" ]
4958
50- id_map [taxon_id ] = (taxon_id , scientific_name , assessment_year , int (gbif_id ))
59+ id_map [taxon_id ] = (taxon_id , scientific_name , assessment_year , int (gbif_id ), class_name )
5160 except (KeyError , ValueError ):
52- id_map [taxon_id ] = (taxon_id , scientific_name , assessment_year , None )
53- except requests .exceptions .ConnectionError :
61+ id_map [taxon_id ] = (taxon_id , scientific_name , assessment_year , None , class_name )
62+ except requests .exceptions .ConnectionError as exc :
5463 # GBIF is not longer happy to talk to us? We should cache whatever data we already
5564 # have and give up
5665 map_data = id_map .values ()
5766 map_df = pd .DataFrame (
5867 map_data ,
59- columns = ["iucn_taxon_id" , "scientific_name" , "assessment_year" , "gbif_id" ],
68+ columns = ["iucn_taxon_id" , "scientific_name" , "assessment_year" , "gbif_id" , "class_name" ],
6069 )
6170 map_df ["gbif_id" ] = map_df ["gbif_id" ].astype ('Int64' )
6271 map_df .to_csv (map_filename , index = False )
72+ print (exc )
6373 sys .exit ("Connection error from GBIF, aborting." )
6474
65- time .sleep (0.1 ) # rate limiting
75+ time .sleep (0.5 ) # rate limiting
6676
6777 map_data = id_map .values ()
6878 map_df = pd .DataFrame (
6979 map_data ,
70- columns = ["iucn_taxon_id" , "scientific_name" , "assessment_year" , "gbif_id" ],
80+ columns = ["iucn_taxon_id" , "scientific_name" , "assessment_year" , "gbif_id" , "class_name" ],
7181 )
7282 map_df ["gbif_id" ] = map_df ["gbif_id" ].astype ('Int64' )
7383 map_df .to_csv (map_filename , index = False )
@@ -76,40 +86,49 @@ def generate_iucn_to_gbif_map(
7686
def build_gbif_query(id_map: pd.DataFrame) -> Any:
    """Build a GBIF occurrence-download predicate from an IUCN-to-GBIF ID map.

    Args:
        id_map: DataFrame with at least ``gbif_id`` and ``assessment_year``
            columns; rows with a missing ``gbif_id`` are ignored.

    Returns:
        A nested predicate dict suitable for the GBIF download API: for each
        assessment year, occurrences of that year's species newer than the
        year, restricted to georeferenced records without geospatial issues.
    """
    known_ids = id_map[id_map.gbif_id.notna()]

    # Batch species by assessment year: there are only tens of distinct years
    # versus thousands of species, so one "in" predicate per year keeps the
    # overall query small.
    ids_by_year = known_ids[["assessment_year", "gbif_id"]] \
        .groupby('assessment_year')['gbif_id'].apply(list)

    per_year_queries = []
    for year, gbif_ids in ids_by_year.items():
        taxon_predicate = {
            "type": "in",
            "key": "TAXON_KEY",
            # gbif_id is float-typed (NaN-capable column); round-trip through
            # int so GBIF receives clean integer keys as strings.
            "values": [str(int(identifier)) for identifier in gbif_ids]
        }
        year_predicate = {
            "type": "greaterThan",
            "key": "YEAR",
            "value": str(int(year)),  # type: ignore
        }
        per_year_queries.append({
            "type": "and",
            "predicates": [taxon_predicate, year_predicate],
        })

    return {
        "type": "and",
        "predicates": [
            {
                "type": "or",
                "predicates": per_year_queries
            },
            {
                "type": "equals",
                "key": "HAS_COORDINATE",
                "value": "TRUE"
            },
            {
                "type": "equals",
                "key": "HAS_GEOSPATIAL_ISSUE",
                "value": "FALSE"
            },
        ]
    }
114133
115134def build_point_validation_table (
@@ -125,19 +144,21 @@ def build_point_validation_table(
125144
126145def fetch_gbif_data (
127146 collated_data_path : Path ,
128- gbif_username : str ,
147+ taxa : str ,
148+ gbif_username : str ,
129149 gbif_email : str ,
130150 gbif_password : str ,
131- output_dir_path : Path ,
151+ toplevel_output_dir_path : Path ,
132152) -> None :
133- final_result_path = output_dir_path / "points.csv"
153+ taxa_output_dir_path = toplevel_output_dir_path / taxa
154+ final_result_path = taxa_output_dir_path / "points.csv"
134155 if final_result_path .exists ():
135156 return
136157
137- os .makedirs (output_dir_path , exist_ok = True )
138- download_key_cache_filename = output_dir_path / "download_key"
158+ os .makedirs (taxa_output_dir_path , exist_ok = True )
159+ download_key_cache_filename = taxa_output_dir_path / "download_key"
139160
140- map_df = generate_iucn_to_gbif_map (collated_data_path , output_dir_path )
161+ map_df = generate_iucn_to_gbif_map (collated_data_path , taxa_output_dir_path , taxa )
141162 if map_df is None or len (map_df ) == 0 :
142163 sys .exit ("No specices in GBIF ID list, aborting" )
143164
@@ -147,16 +168,16 @@ def fetch_gbif_data(
147168 request .add_predicate_dict (query )
148169
149170 download_key = request .post_download (gbif_username , gbif_password )
150- download_key_cache_filename = output_dir_path / "download_key"
171+ download_key_cache_filename = taxa_output_dir_path / "download_key"
151172 with open (download_key_cache_filename , "w" , encoding = "UTF-8" ) as f :
152173 f .write (download_key )
153174 else :
154175 with open (download_key_cache_filename , "r" , encoding = "UTF-8" ) as f :
155176 download_key = f .read ()
156177
157- expected_csv = output_dir_path / f"{ download_key } .csv"
178+ expected_csv = taxa_output_dir_path / f"{ download_key } .csv"
158179 if not expected_csv .exists ():
159- expected_download = output_dir_path / f"{ download_key } .zip"
180+ expected_download = taxa_output_dir_path / f"{ download_key } .zip"
160181 if not expected_download .exists ():
161182 while True :
162183 metadata = pygbif .occurrences .download_meta (download_key )
@@ -166,13 +187,13 @@ def fetch_gbif_data(
166187 time .sleep (30.0 )
167188 continue
168189 case "SUCCEEDED" :
169- file_path = pygbif .occurrences .download_get (download_key , path = output_dir_path )
190+ file_path = pygbif .occurrences .download_get (download_key , path = taxa_output_dir_path )
170191 print (f"Results are in { file_path } " )
171192 break
172193 case _:
173194 sys .exit (f"Failed to download data, status: { metadata ['status' ]} " )
174195 with zipfile .ZipFile (expected_download , 'r' ) as zip_file :
175- zip_file .extractall (output_dir_path )
196+ zip_file .extractall (taxa_output_dir_path )
176197 if not expected_csv .exists ():
177198 sys .exit ("Extracted GBIF zip did not contain expected CSV file" )
178199
@@ -221,6 +242,12 @@ def main() -> None:
221242 help = "Password of user's GBIF account. Can also be set in environment." ,
222243 dest = "gbif_password" ,
223244 )
245+ parser .add_argument (
246+ '--taxa' ,
247+ type = str ,
248+ required = True ,
249+ dest = 'taxa' ,
250+ )
224251 parser .add_argument (
225252 "--output_dir" ,
226253 type = Path ,
@@ -239,6 +266,7 @@ def main() -> None:
239266
240267 fetch_gbif_data (
241268 args .collated_data_path ,
269+ args .taxa ,
242270 args .gbif_username ,
243271 args .gbif_email ,
244272 args .gbif_password ,
0 commit comments