|
30 | 30 |
|
31 | 31 | logger = logging.getLogger(os.path.basename(__file__)) |
32 | 32 |
|
| 33 | +__BD_BORDERS = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres")) |
33 | 34 |
|
34 | 35 | def gbif_doi_search(doi): |
35 | 36 | """ |
@@ -116,7 +117,7 @@ def gbif_download_request(uuid, timeout, api_mail, api_user, api_pass): |
116 | 117 | continue |
117 | 118 |
|
118 | 119 | # Espera 20 segundos antes de realizar la siguiente verificación |
119 | | - time.sleep(20 - (time.time() - t1)) |
| 120 | + time.sleep(10 - (time.time() - t1)) |
120 | 121 |
|
121 | 122 | # Imprime el estado actual de la descarga |
122 | 123 | logger.debug( |
@@ -311,33 +312,49 @@ def taxonomic_percentajes(df): |
311 | 312 | total_data = len(df) |
312 | 313 |
|
313 | 314 | # Porcentaje de géneros que están presentes en el catálogo de vida (Species2000) |
314 | | - percentaje_genus = ( |
315 | | - df.value_counts(subset=["genus"], dropna=False) |
316 | | - .reset_index(name="N") |
317 | | - .apply(is_in_catalogue_of_life, axis=1) |
318 | | - .sum() |
319 | | - / total_data |
320 | | - * 100 |
321 | | - ) |
| 315 | + try: |
| 316 | + percentaje_genus = ( |
| 317 | + df.value_counts(subset=["genus"], dropna=False) |
| 318 | + .reset_index(name="N") |
| 319 | + .apply(is_in_catalogue_of_life, axis=1) |
| 320 | + .sum() |
| 321 | + / total_data |
| 322 | + * 100 |
| 323 | + ) |
| 324 | + except Exception as e: |
| 325 | + logger.debug(f"ERROR genus - {e}") |
| 326 | + percentaje_genus = 0 |
322 | 327 |
|
323 | 328 | # Porcentaje de especies presentes en el DataFrame. |
324 | | - percentaje_species = df["specificEpithet"].count() / total_data * 100 |
| 329 | + try: |
| 330 | + percentaje_species = df["specificEpithet"].count() / total_data * 100 |
| 331 | + except Exception as e: |
| 332 | + logger.debug(f"ERROR specificEpithet - {e}") |
| 333 | + percentaje_species = 0 |
325 | 334 |
|
326 | 335 | # Porcentaje de calidad para la jerarquía taxonómica |
327 | | - percentaje_hierarchy = ( |
328 | | - df.value_counts( |
329 | | - subset=["higherClassification", "kingdom", "class", "order", "family"], |
330 | | - dropna=False, |
| 336 | + try: |
| 337 | + percentaje_hierarchy = ( |
| 338 | + df.value_counts( |
| 339 | + subset=["higherClassification", "kingdom", "class", "order", "family"], |
| 340 | + dropna=False, |
| 341 | + ) |
| 342 | + .reset_index(name="N") |
| 343 | + .apply(hierarchy_weights, axis=1) |
| 344 | + .sum() |
| 345 | + / total_data |
| 346 | + * 100 |
331 | 347 | ) |
332 | | - .reset_index(name="N") |
333 | | - .apply(hierarchy_weights, axis=1) |
334 | | - .sum() |
335 | | - / total_data |
336 | | - * 100 |
337 | | - ) |
| 348 | + except Exception as e: |
| 349 | + logger.debug(f"ERROR hierarchy - {e}") |
| 350 | + percentaje_hierarchy = 0 |
338 | 351 |
|
339 | 352 | # Porcentaje de identificadores disponibles en el DataFrame |
340 | | - percentaje_identifiers = df["identifiedBy"].count() / total_data * 100 |
| 353 | + try: |
| 354 | + percentaje_identifiers = df["identifiedBy"].count() / total_data * 100 |
| 355 | + except Exception as e: |
| 356 | + logger.debug(f"ERROR identifiedBy - {e}") |
| 357 | + percentaje_identifiers = 0 |
341 | 358 |
|
342 | 359 | # Porcentaje total de calidad taxonómica combinando los porcentajes ponderados |
343 | 360 | percentaje_taxonomic = ( |
@@ -388,43 +405,59 @@ def geographic_percentajes(df): |
388 | 405 | total_data = len(df) |
389 | 406 |
|
390 | 407 | # Porcentaje de ocurrencias con coordenadas válidas (latitud y longitud presentes) |
391 | | - percentaje_coordinates = ( |
392 | | - len(df[df["decimalLatitude"].notnull() & df["decimalLongitude"].notnull()]) |
393 | | - / total_data |
394 | | - * 100 |
395 | | - ) |
396 | | - |
| 408 | + try: |
| 409 | + percentaje_coordinates = ( |
| 410 | + len(df[df["decimalLatitude"].notnull() & df["decimalLongitude"].notnull()]) |
| 411 | + / total_data |
| 412 | + * 100 |
| 413 | + ) |
| 414 | + except Exception as e: |
| 415 | + logger.debug(f"ERROR coordinates - {e}") |
| 416 | + percentaje_coordinates = 0 |
| 417 | + |
397 | 418 | # Porcentaje de ocurrencias con códigos de país válidos |
398 | | - percentaje_countries = ( |
399 | | - df.value_counts( |
400 | | - subset=["countryCode"], |
401 | | - dropna=False, |
| 419 | + try: |
| 420 | + percentaje_countries = ( |
| 421 | + df.value_counts( |
| 422 | + subset=["countryCode"], |
| 423 | + dropna=False, |
| 424 | + ) |
| 425 | + .reset_index(name="N") |
| 426 | + .apply(is_valid_country_code, axis=1) |
| 427 | + .sum() |
| 428 | + / total_data |
| 429 | + * 100 |
402 | 430 | ) |
403 | | - .reset_index(name="N") |
404 | | - .apply(is_valid_country_code, axis=1) |
405 | | - .sum() |
406 | | - / total_data |
407 | | - * 100 |
408 | | - ) |
| 431 | + except Exception as e: |
| 432 | + logger.debug(f"ERROR countries - {e}") |
| 433 | + percentaje_countries = 0 |
409 | 434 |
|
410 | 435 | # Porcentaje de ocurrencias con incertidumbre en las coordenadas |
411 | | - percentaje_coordinates_uncertainty = ( |
412 | | - len(df[df.coordinateUncertaintyInMeters > 0]) / total_data * 100 |
413 | | - ) |
| 436 | + try: |
| 437 | + percentaje_coordinates_uncertainty = ( |
| 438 | + len(df[df.coordinateUncertaintyInMeters > 0]) / total_data * 100 |
| 439 | + ) |
| 440 | + except Exception as e: |
| 441 | + logger.debug(f"ERROR coordinates uncertainty - {e}") |
| 442 | + percentaje_coordinates_uncertainty = 0 |
414 | 443 |
|
415 | 444 | # Porcentaje de ocurrencias con coordenadas incorrectas |
416 | | - percentaje_incorrect_coordinates = ( |
417 | | - df.round(3) |
418 | | - .value_counts( |
419 | | - subset=["decimalLatitude", "decimalLongitude", "countryCode"], |
420 | | - dropna=False, |
| 445 | + try: |
| 446 | + percentaje_incorrect_coordinates = ( |
| 447 | + df.round(3) |
| 448 | + .value_counts( |
| 449 | + subset=["decimalLatitude", "decimalLongitude", "countryCode"], |
| 450 | + dropna=False, |
| 451 | + ) |
| 452 | + .reset_index(name="N") |
| 453 | + .apply(is_incorrect_coordinate, axis=1) |
| 454 | + .sum() |
| 455 | + / total_data |
| 456 | + * 100 |
421 | 457 | ) |
422 | | - .reset_index(name="N") |
423 | | - .apply(is_incorrect_coordinate, axis=1) |
424 | | - .sum() |
425 | | - / total_data |
426 | | - * 100 |
427 | | - ) |
| 458 | + except Exception as e: |
| 459 | + logger.debug(f"ERROR incorrect coordinates - {e}") |
| 460 | + percentaje_incorrect_coordinates = 0 |
428 | 461 |
|
429 | 462 | # Porcentaje total de calidad geográfica combinando los porcentajes ponderados |
430 | 463 | percentaje_geographic = ( |
@@ -474,29 +507,60 @@ def temporal_percentajes(df): |
474 | 507 | # Total de ocurrencias |
475 | 508 | total_data = len(df) |
476 | 509 |
|
| 510 | + def safe_date(date): |
| 511 | + try: |
| 512 | + return str(pd.to_datetime(date)) |
| 513 | + except Exception as e: |
| 514 | + # print(e) |
| 515 | + return date |
| 516 | + |
477 | 517 | # Columna de fechas |
478 | | - dates = pd.to_datetime( |
479 | | - df[df.eventDate.notnull()].eventDate, |
480 | | - # infer_datetime_format=True, |
481 | | - errors="coerce", |
482 | | - ) |
| 518 | + dates = df[df.eventDate.notnull()].copy() |
| 519 | + if dates.empty: |
| 520 | + return {"Temporal": 0, "Years": 0, "Months": 0, "Days": 0, "IncorrectDates": 0} |
| 521 | + dates["date"] = dates.eventDate.apply(safe_date) |
483 | 522 |
|
484 | 523 | # Porcentaje de años validos |
485 | | - years = dates.dt.year |
486 | | - percentaje_years = ( |
487 | | - sum((years >= 0) & (years <= datetime.date.today().year)) / total_data * 100 |
488 | | - ) |
| 524 | + try: |
| 525 | + dates["year"] = dates.date.str[:4].astype("Int64") |
| 526 | + percentaje_years = ( |
| 527 | + sum((dates.year >= 0) & (dates.year <= datetime.date.today().year)) |
| 528 | + / total_data |
| 529 | + * 100 |
| 530 | + ) |
| 531 | + except Exception as e: |
| 532 | + logger.debug(f"ERROR year - {e}") |
| 533 | + percentaje_years = 0 |
489 | 534 |
|
490 | 535 | # Porcentaje de meses validos |
491 | | - months = dates.dt.month |
492 | | - percentaje_months = sum((months >= 1) & (months <= 12)) / total_data * 100 |
| 536 | + try: |
| 537 | + dates["month"] = dates.date.str[5:7].astype("Int64") |
| 538 | + percentaje_months = ( |
| 539 | + sum((dates.month >= 1) & (dates.month <= 12)) / total_data * 100 |
| 540 | + ) |
| 541 | + except Exception as e: |
| 542 | + logger.debug(f"ERROR month - {e}") |
| 543 | + percentaje_months = 0 |
493 | 544 |
|
494 | 545 | # Porcentaje de días validos |
495 | | - days = dates.dt.day |
496 | | - percentaje_days = sum((days >= 1) & (days <= 31)) / total_data * 100 |
| 546 | + try: |
| 547 | + dates["day"] = dates.date.str[8:10].astype("Int64") |
| 548 | + percentaje_days = sum((dates.day >= 1) & (dates.day <= 31)) / total_data * 100 |
| 549 | + except Exception as e: |
| 550 | + logger.debug(f"ERROR day - {e}") |
| 551 | + percentaje_days = 0 |
497 | 552 |
|
498 | 553 | # Porcentaje de fechas incorrectas |
499 | | - percentaje_incorrect_dates = sum(dates.isnull()) / total_data * 100 |
| 554 | + try: |
| 555 | + dates["correct"] = dates.date.apply( |
| 556 | + lambda x: bool( |
| 557 | + re.match(r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$", x.strip()) |
| 558 | + ) |
| 559 | + ) |
| 560 | + percentaje_incorrect_dates = sum(~dates.correct) / total_data * 100 |
| 561 | + except Exception as e: |
| 562 | + logger.debug(f"ERROR incorrect dates - {e}") |
| 563 | + percentaje_incorrect_dates = 0 |
500 | 564 |
|
501 | 565 | # Porcentaje total de calidad temporal combinando los porcentajes ponderados |
502 | 566 | percentaje_temporal = ( |
@@ -584,7 +648,7 @@ def coordinate_in_country(codigo_pais, latitud, longitud): |
584 | 648 | pais = pycountry.countries.get(alpha_2=codigo_pais).alpha_3 |
585 | 649 | if pais: |
586 | 650 | # Cargamos el conjunto de datos de límites de países |
587 | | - world = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres")) |
| 651 | + world = __BD_BORDERS.copy() |
588 | 652 |
|
589 | 653 | # Obtenemos el polígono del país |
590 | 654 | poligono_pais = world[world["iso_a3"] == pais].geometry.squeeze() |
|
0 commit comments