Skip to content

Commit 694be1f

Browse files
committed
Optimization and error handling
1 parent 5fb3774 commit 694be1f

File tree

2 files changed

+131
-67
lines changed

2 files changed

+131
-67
lines changed

plugins/gbif/gbif_data.py

Lines changed: 130 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030

3131
logger = logging.getLogger(os.path.basename(__file__))
3232

33+
__BD_BORDERS = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))
3334

3435
def gbif_doi_search(doi):
3536
"""
@@ -116,7 +117,7 @@ def gbif_download_request(uuid, timeout, api_mail, api_user, api_pass):
116117
continue
117118

118119
# Espera 20 segundos antes de realizar la siguiente verificación
119-
time.sleep(20 - (time.time() - t1))
120+
time.sleep(10 - (time.time() - t1))
120121

121122
# Imprime el estado actual de la descarga
122123
logger.debug(
@@ -311,33 +312,49 @@ def taxonomic_percentajes(df):
311312
total_data = len(df)
312313

313314
# Porcentaje de géneros que están presentes en el catálogo de vida (Species2000)
314-
percentaje_genus = (
315-
df.value_counts(subset=["genus"], dropna=False)
316-
.reset_index(name="N")
317-
.apply(is_in_catalogue_of_life, axis=1)
318-
.sum()
319-
/ total_data
320-
* 100
321-
)
315+
try:
316+
percentaje_genus = (
317+
df.value_counts(subset=["genus"], dropna=False)
318+
.reset_index(name="N")
319+
.apply(is_in_catalogue_of_life, axis=1)
320+
.sum()
321+
/ total_data
322+
* 100
323+
)
324+
except Exception as e:
325+
logger.debug(f"ERROR genus - {e}")
326+
percentaje_genus = 0
322327

323328
# Porcentaje de especies presentes en el DataFrame.
324-
percentaje_species = df["specificEpithet"].count() / total_data * 100
329+
try:
330+
percentaje_species = df["specificEpithet"].count() / total_data * 100
331+
except Exception as e:
332+
logger.debug(f"ERROR specificEpithet - {e}")
333+
percentaje_species = 0
325334

326335
# Porcentaje de calidad para la jerarquía taxonómica
327-
percentaje_hierarchy = (
328-
df.value_counts(
329-
subset=["higherClassification", "kingdom", "class", "order", "family"],
330-
dropna=False,
336+
try:
337+
percentaje_hierarchy = (
338+
df.value_counts(
339+
subset=["higherClassification", "kingdom", "class", "order", "family"],
340+
dropna=False,
341+
)
342+
.reset_index(name="N")
343+
.apply(hierarchy_weights, axis=1)
344+
.sum()
345+
/ total_data
346+
* 100
331347
)
332-
.reset_index(name="N")
333-
.apply(hierarchy_weights, axis=1)
334-
.sum()
335-
/ total_data
336-
* 100
337-
)
348+
except Exception as e:
349+
logger.debug(f"ERROR hierarchy - {e}")
350+
percentaje_hierarchy = 0
338351

339352
# Porcentaje de identificadores disponibles en el DataFrame
340-
percentaje_identifiers = df["identifiedBy"].count() / total_data * 100
353+
try:
354+
percentaje_identifiers = df["identifiedBy"].count() / total_data * 100
355+
except Exception as e:
356+
logger.debug(f"ERROR identifiedBy - {e}")
357+
percentaje_identifiers = 0
341358

342359
# Porcentaje total de calidad taxonómica combinando los porcentajes ponderados
343360
percentaje_taxonomic = (
@@ -388,43 +405,59 @@ def geographic_percentajes(df):
388405
total_data = len(df)
389406

390407
# Porcentaje de ocurrencias con coordenadas válidas (latitud y longitud presentes)
391-
percentaje_coordinates = (
392-
len(df[df["decimalLatitude"].notnull() & df["decimalLongitude"].notnull()])
393-
/ total_data
394-
* 100
395-
)
396-
408+
try:
409+
percentaje_coordinates = (
410+
len(df[df["decimalLatitude"].notnull() & df["decimalLongitude"].notnull()])
411+
/ total_data
412+
* 100
413+
)
414+
except Exception as e:
415+
logger.debug(f"ERROR coordinates - {e}")
416+
percentaje_coordinates = 0
417+
397418
# Porcentaje de ocurrencias con códigos de país válidos
398-
percentaje_countries = (
399-
df.value_counts(
400-
subset=["countryCode"],
401-
dropna=False,
419+
try:
420+
percentaje_countries = (
421+
df.value_counts(
422+
subset=["countryCode"],
423+
dropna=False,
424+
)
425+
.reset_index(name="N")
426+
.apply(is_valid_country_code, axis=1)
427+
.sum()
428+
/ total_data
429+
* 100
402430
)
403-
.reset_index(name="N")
404-
.apply(is_valid_country_code, axis=1)
405-
.sum()
406-
/ total_data
407-
* 100
408-
)
431+
except Exception as e:
432+
logger.debug(f"ERROR countries - {e}")
433+
percentaje_countries = 0
409434

410435
# Porcentaje de ocurrencias con incertidumbre en las coordenadas
411-
percentaje_coordinates_uncertainty = (
412-
len(df[df.coordinateUncertaintyInMeters > 0]) / total_data * 100
413-
)
436+
try:
437+
percentaje_coordinates_uncertainty = (
438+
len(df[df.coordinateUncertaintyInMeters > 0]) / total_data * 100
439+
)
440+
except Exception as e:
441+
logger.debug(f"ERROR coordinates uncertainty - {e}")
442+
percentaje_coordinates_uncertainty = 0
414443

415444
# Porcentaje de ocurrencias con coordenadas incorrectas
416-
percentaje_incorrect_coordinates = (
417-
df.round(3)
418-
.value_counts(
419-
subset=["decimalLatitude", "decimalLongitude", "countryCode"],
420-
dropna=False,
445+
try:
446+
percentaje_incorrect_coordinates = (
447+
df.round(3)
448+
.value_counts(
449+
subset=["decimalLatitude", "decimalLongitude", "countryCode"],
450+
dropna=False,
451+
)
452+
.reset_index(name="N")
453+
.apply(is_incorrect_coordinate, axis=1)
454+
.sum()
455+
/ total_data
456+
* 100
421457
)
422-
.reset_index(name="N")
423-
.apply(is_incorrect_coordinate, axis=1)
424-
.sum()
425-
/ total_data
426-
* 100
427-
)
458+
except Exception as e:
459+
logger.debug(f"ERROR incorrect coordinates - {e}")
460+
percentaje_incorrect_coordinates = 0
428461

429462
# Porcentaje total de calidad geográfica combinando los porcentajes ponderados
430463
percentaje_geographic = (
@@ -474,29 +507,60 @@ def temporal_percentajes(df):
474507
# Total de ocurrencias
475508
total_data = len(df)
476509

510+
def safe_date(date):
511+
try:
512+
return str(pd.to_datetime(date))
513+
except Exception as e:
514+
# print(e)
515+
return date
516+
477517
# Columna de fechas
478-
dates = pd.to_datetime(
479-
df[df.eventDate.notnull()].eventDate,
480-
# infer_datetime_format=True,
481-
errors="coerce",
482-
)
518+
dates = df[df.eventDate.notnull()].copy()
519+
if dates.empty:
520+
return {"Temporal": 0, "Years": 0, "Months": 0, "Days": 0, "IncorrectDates": 0}
521+
dates["date"] = dates.eventDate.apply(safe_date)
483522

484523
# Porcentaje de años validos
485-
years = dates.dt.year
486-
percentaje_years = (
487-
sum((years >= 0) & (years <= datetime.date.today().year)) / total_data * 100
488-
)
524+
try:
525+
dates["year"] = dates.date.str[:4].astype("Int64")
526+
percentaje_years = (
527+
sum((dates.year >= 0) & (dates.year <= datetime.date.today().year))
528+
/ total_data
529+
* 100
530+
)
531+
except Exception as e:
532+
logger.debug(f"ERROR year - {e}")
533+
percentaje_years = 0
489534

490535
# Porcentaje de meses validos
491-
months = dates.dt.month
492-
percentaje_months = sum((months >= 1) & (months <= 12)) / total_data * 100
536+
try:
537+
dates["month"] = dates.date.str[5:7].astype("Int64")
538+
percentaje_months = (
539+
sum((dates.month >= 1) & (dates.month <= 12)) / total_data * 100
540+
)
541+
except Exception as e:
542+
logger.debug(f"ERROR month - {e}")
543+
percentaje_months = 0
493544

494545
# Porcentaje de días validos
495-
days = dates.dt.day
496-
percentaje_days = sum((days >= 1) & (days <= 31)) / total_data * 100
546+
try:
547+
dates["day"] = dates.date.str[8:10].astype("Int64")
548+
percentaje_days = sum((dates.day >= 1) & (dates.day <= 31)) / total_data * 100
549+
except Exception as e:
550+
logger.debug(f"ERROR day - {e}")
551+
percentaje_days = 0
497552

498553
# Porcentaje de fechas incorrectas
499-
percentaje_incorrect_dates = sum(dates.isnull()) / total_data * 100
554+
try:
555+
dates["correct"] = dates.date.apply(
556+
lambda x: bool(
557+
re.match(r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$", x.strip())
558+
)
559+
)
560+
percentaje_incorrect_dates = sum(~dates.correct) / total_data * 100
561+
except Exception as e:
562+
logger.debug(f"ERROR incorrect dates - {e}")
563+
percentaje_incorrect_dates = 0
500564

501565
# Porcentaje total de calidad temporal combinando los porcentajes ponderados
502566
percentaje_temporal = (
@@ -584,7 +648,7 @@ def coordinate_in_country(codigo_pais, latitud, longitud):
584648
pais = pycountry.countries.get(alpha_2=codigo_pais).alpha_3
585649
if pais:
586650
# Cargamos el conjunto de datos de límites de países
587-
world = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))
651+
world = __BD_BORDERS.copy()
588652

589653
# Obtenemos el polígono del país
590654
poligono_pais = world[world["iso_a3"] == pais].geometry.squeeze()

plugins/gbif/plugin.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -329,7 +329,7 @@ def data_01(self):
329329
330330
<tr>
331331
<td bgcolor="#B2B0B0"> <b> Geographic </b> </td>
332-
<td bgcolor={self.get_color(ica["Geographic"])}> <b> {ica["Geographic"]}% </b> </td>
332+
<td bgcolor={self.get_color(ica["Geographic"])}> <b> {ica["Geographic"]:.2f}% </b> </td>
333333
</tr>
334334
<tr>
335335
<td bgcolor="#D5D5D5"> Coordinates </td>

0 commit comments

Comments
 (0)