Temporal dimension fixed

ferag · ferag · commit 943e143a9e8f · 2025-05-12T12:57:13.000+02:00
diff --git a/plugins/gbif/gbif_data.py b/plugins/gbif/gbif_data.py
@@ -11,6 +11,7 @@
 from concurrent.futures import ThreadPoolExecutor
 
 import geopandas as gpd
+import numpy as np
 import pandas as pd
 import pycountry
 import requests
@@ -146,6 +147,38 @@ def gbif_doi_download(doi: str, timeout=-1, auth=None):
     except Exception as e:
         logger.debug(f"ERROR Searching Data: {e}")
         return download_dict
+    
+    logger.debug("Intentando IPT")
+    endpoints = search_request["endpoints"]
+    logger.debug(endpoints)
+    for ep in endpoints:
+        logger.debug("Probando endpoints")
+        if ep.get("type") == "DWC_ARCHIVE" and ep.get("url"):
+            url = ep["url"]
+            logger.debug(f"Intentando descarga directa desde endpoint DWC_ARCHIVE: {url}")
+            try:
+                os.makedirs(os.path.dirname(download_dict["path"]), exist_ok=True)
+                # Hacemos el GET con timeout razonable (por ejemplo 60s)
+                with requests.get(url, stream=True, timeout=60) as resp:
+                    resp.raise_for_status()
+                    with open(download_dict["path"], "wb") as f:
+                        for chunk in resp.iter_content(chunk_size=8192):
+                            if chunk:
+                                f.write(chunk)
+                # Si llegamos aquí, la descarga fue exitosa
+                download_dict.update({
+                    "download_url": url,
+                    "download_method": "endpoint",
+                    # opcionalmente, capturamos size si viene en headers
+                    "size": int(resp.headers.get("content-length", 0))
+                })
+                logger.debug("Descarga directa exitosa.")
+                return download_dict
+            except Exception as e:
+                logger.debug(f"ERROR descarga directa desde endpoint: {e}")
+                # si falla, seguimos al siguiente endpoint o al fallback
+                continue
+
 
     # Genera la solicitud de descarga
     logger.debug("Solicitud de Descarga")
@@ -224,7 +257,7 @@ def ICA(filepath):
             "countryCode",
             "coordinateUncertaintyInMeters",
         ]
-        temporal_columns = ["eventDate"]
+        temporal_columns = ["eventDate", "verbatimEventDate", "year", "month", "day"]
         try:
             df = results.pd_read(
                 results.core_file_location,
@@ -546,84 +579,145 @@ def temporal_percentajes(df):
     Temporal: 63.45%
     {'Temporal': 63.45, 'Years': 25.6, 'Months': 15.2, 'Days': 18.9, 'IncorrectDates': 3.75}
     """
-    # Total de ocurrencias
-    total_data = len(df)
 
-    def safe_date(date):
-        try:
-            return str(pd.to_datetime(date))
-        except Exception as e:
-            # print(e)
-            return date
+    # ── 0) Unificar eventDate: si existe verbatimEventDate y sus valores no están vacíos,
+    # reemplazar en eventDate sólo donde éste sea nulo o cadena vacía.
+    # ── 0) Unificar eventDate / verbatimEventDate ──────────────────────────────
+
+    # Asegurarnos de tener copia y detectar columnas
+    df = df.copy()
+    has_ev   = 'eventDate' in df.columns
+    has_verb = 'verbatimEventDate' in df.columns
+
+    if has_verb and not has_ev:
+        # Sólo verbatimEventDate existe → lo renombramos
+        df = df.rename(columns={'verbatimEventDate': 'eventDate'})
+    elif has_ev and has_verb:
+        ev   = df['eventDate']
+        verb = df['verbatimEventDate']
+        # Convertimos a str y recortamos espacios para detectar "" y "nan"
+        ev_str = ev.astype(str).fillna('').str.strip().str.lower()
+        # Mascara de “eventDate válido”: no nulo, no vacío, no "nan"
+        valid_ev = ev.notna() & (ev_str != '') & (ev_str != 'nan')
+        # Donde valid_ev es True, mantenemos ev; donde es False, tomamos verbatim
+        df['eventDate'] = ev.where(valid_ev, verb)
+        # Y quitamos ya la columna verbatim
+        df = df.drop(columns=['verbatimEventDate'])
+    # si sólo existía eventDate, no tocamos nada
+
+
+    # ── 1) y siguientes: idéntico al anterior...
+    total_data = len(df)
 
-    # Columna de fechas
-    dates = df[df.eventDate.notnull()].copy()
-    if dates.empty:
+    # Si no hay ninguna fecha, devolvemos la penalización directa
+    if df['eventDate'].notna().sum() == 0:
         return {
-            "Temporal": -15 * 0.2,
+            "Temporal": -15,
             "Years": 0,
             "Months": 0,
             "Days": 0,
             "IncorrectDates": -15,
         }
-    dates["date"] = dates.eventDate.apply(safe_date)
 
-    # Porcentaje de años validos
-    try:
-        dates["year"] = df[df.year.notnull()].copy()
-        percentaje_years = (
-            sum((dates.year >= 0) & (dates.year <= datetime.date.today().year))
-            / total_data
-            * 100
-        )
-    except Exception as e:
-        logger.debug(f"ERROR year - {e}")
-        percentaje_years = 0
+    # Convertimos year/month/day a numérico (NaN si falla)
+    if 'year' in df.columns:
+        df['year'] = pd.to_numeric(df['year'], errors='coerce')
+    if 'month' in df.columns:
+        df['month'] = pd.to_numeric(df['month'], errors='coerce')
+    if 'day' in df.columns:
+        df['day'] = pd.to_numeric(df['day'], errors='coerce')
+
+    # 1) Separa eventDate en hasta dos trozos
+    date_splits = (
+        df["eventDate"]
+        .astype(str)
+        .str.strip()
+        .str.split("/", n=1, expand=True)
+    )
+    # Si sólo salió una columna, duplicarla
+    if date_splits.shape[1] == 1:
+        date_splits[1] = date_splits[0]
+    # Rellenar vacíos o NaN de la segunda con la primera
+    date_splits[1] = np.where(
+        date_splits[1].eq("") | date_splits[1].isna(),
+        date_splits[0],
+        date_splits[1]
+    )
+    # 2) Parseo a datetime (NaT si falla)
+    df["start_date"] = pd.to_datetime(date_splits[0], errors="coerce")
+    df["end_date"]   = pd.to_datetime(date_splits[1], errors="coerce")
 
-    # Porcentaje de meses validos
-    try:
-        dates["month"] = df[df.month.notnull()].copy()
-        percentaje_months = (
-            sum((dates.month >= 1) & (dates.month <= 12)) / total_data * 100
-        )
-    except Exception as e:
-        logger.debug(f"ERROR month - {e}")
-        percentaje_months = 0
+    # Preparamos variables de salida
+    percentage_years = percentage_months = percentage_day = 0
+    percentage_incorrect_dates = 100
 
-    # Porcentaje de días validos
-    try:
-        dates["day"] = df[df.day.notnull()].copy()
-        percentaje_days = sum((dates.day >= 1) & (dates.day <= 31)) / total_data * 100
-    except Exception as e:
-        logger.debug(f"ERROR day - {e}")
-        percentaje_days = 0
+    # ── YEARS ───────────────────────────────────────────────────────────────────────
+    if 'year' in df.columns:
+        df['start_year'] = df['start_date'].dt.year
+        df['end_year']   = df['end_date'].dt.year
 
-    # Porcentaje de fechas incorrectas
-    try:
-        dates["correct"] = dates.date.apply(
-            lambda x: bool(
-                re.match(r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$", x.strip())
-            )
-        )
-        percentaje_incorrect_dates = sum(~dates.correct) / total_data * 100
-    except Exception as e:
-        logger.debug(f"ERROR incorrect dates - {e}")
-        percentaje_incorrect_dates = 0
-
-    # Porcentaje total de calidad temporal combinando los porcentajes ponderados
-    percentaje_temporal = (
-        0.11 * percentaje_years
-        + 0.07 * percentaje_months
-        + 0.02 * percentaje_days
-        - 0.15 * percentaje_incorrect_dates
+        df['year_valid'] = df['year'].between(df['start_year'], df['end_year'])
+        valid_years = int(df['year_valid'].sum())
+        percentage_years = valid_years / total_data * 100
+
+        logger.debug(f"Filas con año válido: {valid_years}/{total_data} ({percentage_years:.2f}%)")
+    else:
+        logger.debug("Columna 'year' no existe: ano_valid = 0")
+
+    # ── MONTHS ──────────────────────────────────────────────────────────────────────
+    if 'month' in df.columns:
+        df['start_month'] = df['start_date'].dt.month
+        df['end_month']   = df['end_date'].dt.month
+
+        # Si start_month o end_month son NaN, la comparación dará False
+        df['month_valid'] = df['month'].between(df['start_month'], df['end_month'])
+        valid_months = int(df['month_valid'].sum())
+        percentage_months = valid_months / total_data * 100
+
+        logger.debug(f"Filas con mes válido: {valid_months}/{total_data} ({percentage_months:.2f}%)")
+    else:
+        logger.debug("Columna 'month' no existe: month_valid = 0")
+
+    # ── DAYS ────────────────────────────────────────────────────────────────────────
+    if 'day' in df.columns:
+        df['start_day'] = df['start_date'].dt.day
+        df['end_day']   = df['end_date'].dt.day
+
+        df['day_valid'] = df['day'].between(df['start_day'], df['end_day'])
+        valid_days = int(df['day_valid'].sum())
+        percentage_day = valid_days / total_data * 100
+
+        logger.debug(f"Filas con día válido: {valid_days}/{total_data} ({percentage_day:.2f}%)")
+    else:
+        logger.debug("Columna 'day' no existe: day_valid = 0")
+
+    # ── VALIDACIÓN FORMATO FECHA ────────────────────────────────────────────────────
+    # start/end validas si no son NaT
+    df['start_date_valid'] = df['start_date'].notna()
+    df['end_date_valid']   = df['end_date'].notna()
+    valid_both = int((df['start_date_valid'] & df['end_date_valid']).sum())
+    percentage_incorrect_dates = 100 - (valid_both / total_data * 100)
+
+    logger.debug(
+        f"Rango fechas válidas: {valid_both}/{total_data} "
+        f"({100-percentage_incorrect_dates:.2f}% correctas, "
+        f"{percentage_incorrect_dates:.2f}% incorrectas)"
+    )
+
+    # ── SCORE FINAL ────────────────────────────────────────────────────────────────
+    percentage_temporal = (
+        0.11 * percentage_years
+        + 0.07 * percentage_months
+        + 0.02 * percentage_day
+        - 0.15 * percentage_incorrect_dates
     )
 
     return {
-        "Temporal": percentaje_temporal,
-        "Years": 0.11 * percentaje_years,
-        "Months": 0.07 * percentaje_months,
-        "Days": 0.02 * percentaje_days,
-        "IncorrectDates": 0.15 * percentaje_incorrect_dates,
+        "Temporal": percentage_temporal,
+        "Years": 0.11 * percentage_years,
+        "Months": 0.07 * percentage_months,
+        "Days": 0.02 * percentage_day,
+        "IncorrectDates": 0.15 * percentage_incorrect_dates,
     }