|
11 | 11 | from concurrent.futures import ThreadPoolExecutor
|
12 | 12 |
|
13 | 13 | import geopandas as gpd
|
| 14 | +import numpy as np |
14 | 15 | import pandas as pd
|
15 | 16 | import pycountry
|
16 | 17 | import requests
|
@@ -146,6 +147,38 @@ def gbif_doi_download(doi: str, timeout=-1, auth=None):
|
146 | 147 | except Exception as e:
|
147 | 148 | logger.debug(f"ERROR Searching Data: {e}")
|
148 | 149 | return download_dict
|
| 150 | + |
| 151 | + logger.debug("Intentando IPT") |
| 152 | + endpoints = search_request["endpoints"] |
| 153 | + logger.debug(endpoints) |
| 154 | + for ep in endpoints: |
| 155 | + logger.debug("Probando endpoints") |
| 156 | + if ep.get("type") == "DWC_ARCHIVE" and ep.get("url"): |
| 157 | + url = ep["url"] |
| 158 | + logger.debug(f"Intentando descarga directa desde endpoint DWC_ARCHIVE: {url}") |
| 159 | + try: |
| 160 | + os.makedirs(os.path.dirname(download_dict["path"]), exist_ok=True) |
| 161 | + # Hacemos el GET con timeout razonable (por ejemplo 60s) |
| 162 | + with requests.get(url, stream=True, timeout=60) as resp: |
| 163 | + resp.raise_for_status() |
| 164 | + with open(download_dict["path"], "wb") as f: |
| 165 | + for chunk in resp.iter_content(chunk_size=8192): |
| 166 | + if chunk: |
| 167 | + f.write(chunk) |
| 168 | + # Si llegamos aquí, la descarga fue exitosa |
| 169 | + download_dict.update({ |
| 170 | + "download_url": url, |
| 171 | + "download_method": "endpoint", |
| 172 | + # opcionalmente, capturamos size si viene en headers |
| 173 | + "size": int(resp.headers.get("content-length", 0)) |
| 174 | + }) |
| 175 | + logger.debug("Descarga directa exitosa.") |
| 176 | + return download_dict |
| 177 | + except Exception as e: |
| 178 | + logger.debug(f"ERROR descarga directa desde endpoint: {e}") |
| 179 | + # si falla, seguimos al siguiente endpoint o al fallback |
| 180 | + continue |
| 181 | + |
149 | 182 |
|
150 | 183 | # Genera la solicitud de descarga
|
151 | 184 | logger.debug("Solicitud de Descarga")
|
@@ -224,7 +257,7 @@ def ICA(filepath):
|
224 | 257 | "countryCode",
|
225 | 258 | "coordinateUncertaintyInMeters",
|
226 | 259 | ]
|
227 |
| - temporal_columns = ["eventDate"] |
| 260 | + temporal_columns = ["eventDate", "verbatimEventDate", "year", "month", "day"] |
228 | 261 | try:
|
229 | 262 | df = results.pd_read(
|
230 | 263 | results.core_file_location,
|
@@ -546,84 +579,145 @@ def temporal_percentajes(df):
|
546 | 579 | Temporal: 63.45%
|
547 | 580 | {'Temporal': 63.45, 'Years': 25.6, 'Months': 15.2, 'Days': 18.9, 'IncorrectDates': 3.75}
|
548 | 581 | """
|
549 |
| - # Total de ocurrencias |
550 |
| - total_data = len(df) |
551 | 582 |
|
552 |
| - def safe_date(date): |
553 |
| - try: |
554 |
| - return str(pd.to_datetime(date)) |
555 |
| - except Exception as e: |
556 |
| - # print(e) |
557 |
| - return date |
| 583 | + # ── 0) Unify eventDate / verbatimEventDate ────────────────────────────── |
| 584 | + # Where eventDate is null, empty or the string "nan", take the value of |
| 585 | + # verbatimEventDate instead; if only verbatimEventDate exists, rename it. |
| 586 | + |
| 587 | + # Asegurarnos de tener copia y detectar columnas |
| 588 | + df = df.copy() |
| 589 | + has_ev = 'eventDate' in df.columns |
| 590 | + has_verb = 'verbatimEventDate' in df.columns |
| 591 | + |
| 592 | + if has_verb and not has_ev: |
| 593 | + # Sólo verbatimEventDate existe → lo renombramos |
| 594 | + df = df.rename(columns={'verbatimEventDate': 'eventDate'}) |
| 595 | + elif has_ev and has_verb: |
| 596 | + ev = df['eventDate'] |
| 597 | + verb = df['verbatimEventDate'] |
| 598 | + # Convertimos a str y recortamos espacios para detectar "" y "nan" |
| 599 | + ev_str = ev.astype(str).fillna('').str.strip().str.lower() |
| 600 | + # Mascara de “eventDate válido”: no nulo, no vacío, no "nan" |
| 601 | + valid_ev = ev.notna() & (ev_str != '') & (ev_str != 'nan') |
| 602 | + # Donde valid_ev es True, mantenemos ev; donde es False, tomamos verbatim |
| 603 | + df['eventDate'] = ev.where(valid_ev, verb) |
| 604 | + # Y quitamos ya la columna verbatim |
| 605 | + df = df.drop(columns=['verbatimEventDate']) |
| 606 | + # si sólo existía eventDate, no tocamos nada |
| 607 | + |
| 608 | + |
| 609 | + # ── 1) Total number of records (denominator for every percentage below) |
| 610 | + total_data = len(df) |
558 | 611 |
|
559 |
| - # Columna de fechas |
560 |
| - dates = df[df.eventDate.notnull()].copy() |
561 |
| - if dates.empty: |
| 612 | + # Si no hay ninguna fecha, devolvemos la penalización directa |
| 613 | + if df['eventDate'].notna().sum() == 0: |
562 | 614 | return {
|
563 |
| - "Temporal": -15 * 0.2, |
| 615 | + "Temporal": -15, |
564 | 616 | "Years": 0,
|
565 | 617 | "Months": 0,
|
566 | 618 | "Days": 0,
|
567 | 619 | "IncorrectDates": -15,
|
568 | 620 | }
|
569 |
| - dates["date"] = dates.eventDate.apply(safe_date) |
570 | 621 |
|
571 |
| - # Porcentaje de años validos |
572 |
| - try: |
573 |
| - dates["year"] = df[df.year.notnull()].copy() |
574 |
| - percentaje_years = ( |
575 |
| - sum((dates.year >= 0) & (dates.year <= datetime.date.today().year)) |
576 |
| - / total_data |
577 |
| - * 100 |
578 |
| - ) |
579 |
| - except Exception as e: |
580 |
| - logger.debug(f"ERROR year - {e}") |
581 |
| - percentaje_years = 0 |
| 622 | + # Convertimos year/month/day a numérico (NaN si falla) |
| 623 | + if 'year' in df.columns: |
| 624 | + df['year'] = pd.to_numeric(df['year'], errors='coerce') |
| 625 | + if 'month' in df.columns: |
| 626 | + df['month'] = pd.to_numeric(df['month'], errors='coerce') |
| 627 | + if 'day' in df.columns: |
| 628 | + df['day'] = pd.to_numeric(df['day'], errors='coerce') |
| 629 | + |
| 630 | + # 1) Separa eventDate en hasta dos trozos |
| 631 | + date_splits = ( |
| 632 | + df["eventDate"] |
| 633 | + .astype(str) |
| 634 | + .str.strip() |
| 635 | + .str.split("/", n=1, expand=True) |
| 636 | + ) |
| 637 | + # Si sólo salió una columna, duplicarla |
| 638 | + if date_splits.shape[1] == 1: |
| 639 | + date_splits[1] = date_splits[0] |
| 640 | + # Rellenar vacíos o NaN de la segunda con la primera |
| 641 | + date_splits[1] = np.where( |
| 642 | + date_splits[1].eq("") | date_splits[1].isna(), |
| 643 | + date_splits[0], |
| 644 | + date_splits[1] |
| 645 | + ) |
| 646 | + # 2) Parseo a datetime (NaT si falla) |
| 647 | + df["start_date"] = pd.to_datetime(date_splits[0], errors="coerce") |
| 648 | + df["end_date"] = pd.to_datetime(date_splits[1], errors="coerce") |
582 | 649 |
|
583 |
| - # Porcentaje de meses validos |
584 |
| - try: |
585 |
| - dates["month"] = df[df.month.notnull()].copy() |
586 |
| - percentaje_months = ( |
587 |
| - sum((dates.month >= 1) & (dates.month <= 12)) / total_data * 100 |
588 |
| - ) |
589 |
| - except Exception as e: |
590 |
| - logger.debug(f"ERROR month - {e}") |
591 |
| - percentaje_months = 0 |
| 650 | + # Preparamos variables de salida |
| 651 | + percentage_years = percentage_months = percentage_day = 0 |
| 652 | + percentage_incorrect_dates = 100 |
592 | 653 |
|
593 |
| - # Porcentaje de días validos |
594 |
| - try: |
595 |
| - dates["day"] = df[df.day.notnull()].copy() |
596 |
| - percentaje_days = sum((dates.day >= 1) & (dates.day <= 31)) / total_data * 100 |
597 |
| - except Exception as e: |
598 |
| - logger.debug(f"ERROR day - {e}") |
599 |
| - percentaje_days = 0 |
| 654 | + # ── YEARS ─────────────────────────────────────────────────────────────────────── |
| 655 | + if 'year' in df.columns: |
| 656 | + df['start_year'] = df['start_date'].dt.year |
| 657 | + df['end_year'] = df['end_date'].dt.year |
600 | 658 |
|
601 |
| - # Porcentaje de fechas incorrectas |
602 |
| - try: |
603 |
| - dates["correct"] = dates.date.apply( |
604 |
| - lambda x: bool( |
605 |
| - re.match(r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$", x.strip()) |
606 |
| - ) |
607 |
| - ) |
608 |
| - percentaje_incorrect_dates = sum(~dates.correct) / total_data * 100 |
609 |
| - except Exception as e: |
610 |
| - logger.debug(f"ERROR incorrect dates - {e}") |
611 |
| - percentaje_incorrect_dates = 0 |
612 |
| - |
613 |
| - # Porcentaje total de calidad temporal combinando los porcentajes ponderados |
614 |
| - percentaje_temporal = ( |
615 |
| - 0.11 * percentaje_years |
616 |
| - + 0.07 * percentaje_months |
617 |
| - + 0.02 * percentaje_days |
618 |
| - - 0.15 * percentaje_incorrect_dates |
| 659 | + df['year_valid'] = df['year'].between(df['start_year'], df['end_year']) |
| 660 | + valid_years = int(df['year_valid'].sum()) |
| 661 | + percentage_years = valid_years / total_data * 100 |
| 662 | + |
| 663 | + logger.debug(f"Filas con año válido: {valid_years}/{total_data} ({percentage_years:.2f}%)") |
| 664 | + else: |
| 665 | + logger.debug("Columna 'year' no existe: ano_valid = 0") |
| 666 | + |
| 667 | + # ── MONTHS ────────────────────────────────────────────────────────────────────── |
| 668 | + if 'month' in df.columns: |
| 669 | + df['start_month'] = df['start_date'].dt.month |
| 670 | + df['end_month'] = df['end_date'].dt.month |
| 671 | + |
| 672 | + # Si start_month o end_month son NaN, la comparación dará False |
| 673 | + df['month_valid'] = df['month'].between(df['start_month'], df['end_month']) |
| 674 | + valid_months = int(df['month_valid'].sum()) |
| 675 | + percentage_months = valid_months / total_data * 100 |
| 676 | + |
| 677 | + logger.debug(f"Filas con mes válido: {valid_months}/{total_data} ({percentage_months:.2f}%)") |
| 678 | + else: |
| 679 | + logger.debug("Columna 'month' no existe: month_valid = 0") |
| 680 | + |
| 681 | + # ── DAYS ──────────────────────────────────────────────────────────────────────── |
| 682 | + if 'day' in df.columns: |
| 683 | + df['start_day'] = df['start_date'].dt.day |
| 684 | + df['end_day'] = df['end_date'].dt.day |
| 685 | + |
| 686 | + df['day_valid'] = df['day'].between(df['start_day'], df['end_day']) |
| 687 | + valid_days = int(df['day_valid'].sum()) |
| 688 | + percentage_day = valid_days / total_data * 100 |
| 689 | + |
| 690 | + logger.debug(f"Filas con día válido: {valid_days}/{total_data} ({percentage_day:.2f}%)") |
| 691 | + else: |
| 692 | + logger.debug("Columna 'day' no existe: day_valid = 0") |
| 693 | + |
| 694 | + # ── VALIDACIÓN FORMATO FECHA ──────────────────────────────────────────────────── |
| 695 | + # start/end validas si no son NaT |
| 696 | + df['start_date_valid'] = df['start_date'].notna() |
| 697 | + df['end_date_valid'] = df['end_date'].notna() |
| 698 | + valid_both = int((df['start_date_valid'] & df['end_date_valid']).sum()) |
| 699 | + percentage_incorrect_dates = 100 - (valid_both / total_data * 100) |
| 700 | + |
| 701 | + logger.debug( |
| 702 | + f"Rango fechas válidas: {valid_both}/{total_data} " |
| 703 | + f"({100-percentage_incorrect_dates:.2f}% correctas, " |
| 704 | + f"{percentage_incorrect_dates:.2f}% incorrectas)" |
| 705 | + ) |
| 706 | + |
| 707 | + # ── SCORE FINAL ──────────────────────────────────────────────────────────────── |
| 708 | + percentage_temporal = ( |
| 709 | + 0.11 * percentage_years |
| 710 | + + 0.07 * percentage_months |
| 711 | + + 0.02 * percentage_day |
| 712 | + - 0.15 * percentage_incorrect_dates |
619 | 713 | )
|
620 | 714 |
|
621 | 715 | return {
|
622 |
| - "Temporal": percentaje_temporal, |
623 |
| - "Years": 0.11 * percentaje_years, |
624 |
| - "Months": 0.07 * percentaje_months, |
625 |
| - "Days": 0.02 * percentaje_days, |
626 |
| - "IncorrectDates": 0.15 * percentaje_incorrect_dates, |
| 716 | + "Temporal": percentage_temporal, |
| 717 | + "Years": 0.11 * percentage_years, |
| 718 | + "Months": 0.07 * percentage_months, |
| 719 | + "Days": 0.02 * percentage_day, |
| 720 | + "IncorrectDates": 0.15 * percentage_incorrect_dates, |
627 | 721 | }
|
628 | 722 |
|
629 | 723 |
|
|
0 commit comments