
Commit 943e143

Temporal dimension fixed
1 parent d63b77a commit 943e143


plugins/gbif/gbif_data.py

Lines changed: 158 additions & 64 deletions
@@ -11,6 +11,7 @@
 from concurrent.futures import ThreadPoolExecutor
 
 import geopandas as gpd
+import numpy as np
 import pandas as pd
 import pycountry
 import requests
@@ -146,6 +147,38 @@ def gbif_doi_download(doi: str, timeout=-1, auth=None):
     except Exception as e:
         logger.debug(f"ERROR Searching Data: {e}")
         return download_dict
+
+    logger.debug("Intentando IPT")
+    endpoints = search_request["endpoints"]
+    logger.debug(endpoints)
+    for ep in endpoints:
+        logger.debug("Probando endpoints")
+        if ep.get("type") == "DWC_ARCHIVE" and ep.get("url"):
+            url = ep["url"]
+            logger.debug(f"Intentando descarga directa desde endpoint DWC_ARCHIVE: {url}")
+            try:
+                os.makedirs(os.path.dirname(download_dict["path"]), exist_ok=True)
+                # Perform the GET with a reasonable timeout (e.g. 60 s)
+                with requests.get(url, stream=True, timeout=60) as resp:
+                    resp.raise_for_status()
+                    with open(download_dict["path"], "wb") as f:
+                        for chunk in resp.iter_content(chunk_size=8192):
+                            if chunk:
+                                f.write(chunk)
+                # If we get here, the download succeeded
+                download_dict.update({
+                    "download_url": url,
+                    "download_method": "endpoint",
+                    # optionally capture the size if it comes in the headers
+                    "size": int(resp.headers.get("content-length", 0))
+                })
+                logger.debug("Descarga directa exitosa.")
+                return download_dict
+            except Exception as e:
+                logger.debug(f"ERROR descarga directa desde endpoint: {e}")
+                # on failure, move on to the next endpoint or to the fallback
+                continue
+
 
     # Generate the download request
     logger.debug("Solicitud de Descarga")
@@ -224,7 +257,7 @@ def ICA(filepath):
         "countryCode",
         "coordinateUncertaintyInMeters",
     ]
-    temporal_columns = ["eventDate"]
+    temporal_columns = ["eventDate", "verbatimEventDate", "year", "month", "day"]
     try:
         df = results.pd_read(
             results.core_file_location,
@@ -546,84 +579,145 @@ def temporal_percentajes(df):
     Temporal: 63.45%
     {'Temporal': 63.45, 'Years': 25.6, 'Months': 15.2, 'Days': 18.9, 'IncorrectDates': 3.75}
     """
-    # Total number of occurrences
-    total_data = len(df)
 
-    def safe_date(date):
-        try:
-            return str(pd.to_datetime(date))
-        except Exception as e:
-            # print(e)
-            return date
+    # ── 0) Unify eventDate: if verbatimEventDate exists and its values are not empty,
+    #      replace eventDate only where it is null or an empty string.
+    # ── 0) Unify eventDate / verbatimEventDate ──────────────────────────────
+
+    # Work on a copy and detect which columns are present
+    df = df.copy()
+    has_ev = 'eventDate' in df.columns
+    has_verb = 'verbatimEventDate' in df.columns
+
+    if has_verb and not has_ev:
+        # Only verbatimEventDate exists → rename it
+        df = df.rename(columns={'verbatimEventDate': 'eventDate'})
+    elif has_ev and has_verb:
+        ev = df['eventDate']
+        verb = df['verbatimEventDate']
+        # Cast to str and strip whitespace to detect "" and "nan"
+        ev_str = ev.astype(str).fillna('').str.strip().str.lower()
+        # Mask of "valid eventDate": not null, not empty, not "nan"
+        valid_ev = ev.notna() & (ev_str != '') & (ev_str != 'nan')
+        # Where valid_ev is True keep ev; where it is False take verbatim
+        df['eventDate'] = ev.where(valid_ev, verb)
+        # And drop the verbatim column
+        df = df.drop(columns=['verbatimEventDate'])
+    # if only eventDate existed, nothing to change
+
+
+    # ── 1) onwards: identical to before...
+    total_data = len(df)
 
-    # Date column
-    dates = df[df.eventDate.notnull()].copy()
-    if dates.empty:
+    # If there is no date at all, return the direct penalty
+    if df['eventDate'].notna().sum() == 0:
         return {
-            "Temporal": -15 * 0.2,
+            "Temporal": -15,
             "Years": 0,
             "Months": 0,
             "Days": 0,
             "IncorrectDates": -15,
         }
-    dates["date"] = dates.eventDate.apply(safe_date)
 
-    # Percentage of valid years
-    try:
-        dates["year"] = df[df.year.notnull()].copy()
-        percentaje_years = (
-            sum((dates.year >= 0) & (dates.year <= datetime.date.today().year))
-            / total_data
-            * 100
-        )
-    except Exception as e:
-        logger.debug(f"ERROR year - {e}")
-        percentaje_years = 0
+    # Convert year/month/day to numeric (NaN on failure)
+    if 'year' in df.columns:
+        df['year'] = pd.to_numeric(df['year'], errors='coerce')
+    if 'month' in df.columns:
+        df['month'] = pd.to_numeric(df['month'], errors='coerce')
+    if 'day' in df.columns:
+        df['day'] = pd.to_numeric(df['day'], errors='coerce')
+
+    # 1) Split eventDate into up to two parts
+    date_splits = (
+        df["eventDate"]
+        .astype(str)
+        .str.strip()
+        .str.split("/", n=1, expand=True)
+    )
+    # If only one column came out, duplicate it
+    if date_splits.shape[1] == 1:
+        date_splits[1] = date_splits[0]
+    # Fill empty or NaN values in the second part with the first
+    date_splits[1] = np.where(
+        date_splits[1].eq("") | date_splits[1].isna(),
+        date_splits[0],
+        date_splits[1]
+    )
+    # 2) Parse to datetime (NaT on failure)
+    df["start_date"] = pd.to_datetime(date_splits[0], errors="coerce")
+    df["end_date"] = pd.to_datetime(date_splits[1], errors="coerce")
 
-    # Percentage of valid months
-    try:
-        dates["month"] = df[df.month.notnull()].copy()
-        percentaje_months = (
-            sum((dates.month >= 1) & (dates.month <= 12)) / total_data * 100
-        )
-    except Exception as e:
-        logger.debug(f"ERROR month - {e}")
-        percentaje_months = 0
+    # Prepare output variables
+    percentage_years = percentage_months = percentage_day = 0
+    percentage_incorrect_dates = 100
 
-    # Percentage of valid days
-    try:
-        dates["day"] = df[df.day.notnull()].copy()
-        percentaje_days = sum((dates.day >= 1) & (dates.day <= 31)) / total_data * 100
-    except Exception as e:
-        logger.debug(f"ERROR day - {e}")
-        percentaje_days = 0
+    # ── YEARS ───────────────────────────────────────────────────────────────────────
+    if 'year' in df.columns:
+        df['start_year'] = df['start_date'].dt.year
+        df['end_year'] = df['end_date'].dt.year
 
-    # Percentage of incorrect dates
-    try:
-        dates["correct"] = dates.date.apply(
-            lambda x: bool(
-                re.match(r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$", x.strip())
-            )
-        )
-        percentaje_incorrect_dates = sum(~dates.correct) / total_data * 100
-    except Exception as e:
-        logger.debug(f"ERROR incorrect dates - {e}")
-        percentaje_incorrect_dates = 0
-
-    # Overall temporal quality percentage combining the weighted percentages
-    percentaje_temporal = (
-        0.11 * percentaje_years
-        + 0.07 * percentaje_months
-        + 0.02 * percentaje_days
-        - 0.15 * percentaje_incorrect_dates
+        df['year_valid'] = df['year'].between(df['start_year'], df['end_year'])
+        valid_years = int(df['year_valid'].sum())
+        percentage_years = valid_years / total_data * 100
+
+        logger.debug(f"Filas con año válido: {valid_years}/{total_data} ({percentage_years:.2f}%)")
+    else:
+        logger.debug("Columna 'year' no existe: ano_valid = 0")
+
+    # ── MONTHS ──────────────────────────────────────────────────────────────────────
+    if 'month' in df.columns:
+        df['start_month'] = df['start_date'].dt.month
+        df['end_month'] = df['end_date'].dt.month
+
+        # If start_month or end_month is NaN, the comparison yields False
+        df['month_valid'] = df['month'].between(df['start_month'], df['end_month'])
+        valid_months = int(df['month_valid'].sum())
+        percentage_months = valid_months / total_data * 100
+
+        logger.debug(f"Filas con mes válido: {valid_months}/{total_data} ({percentage_months:.2f}%)")
+    else:
+        logger.debug("Columna 'month' no existe: month_valid = 0")
+
+    # ── DAYS ────────────────────────────────────────────────────────────────────────
+    if 'day' in df.columns:
+        df['start_day'] = df['start_date'].dt.day
+        df['end_day'] = df['end_date'].dt.day
+
+        df['day_valid'] = df['day'].between(df['start_day'], df['end_day'])
+        valid_days = int(df['day_valid'].sum())
+        percentage_day = valid_days / total_data * 100
+
+        logger.debug(f"Filas con día válido: {valid_days}/{total_data} ({percentage_day:.2f}%)")
+    else:
+        logger.debug("Columna 'day' no existe: day_valid = 0")
+
+    # ── DATE FORMAT VALIDATION ──────────────────────────────────────────────────────
+    # start/end are valid if they are not NaT
+    df['start_date_valid'] = df['start_date'].notna()
+    df['end_date_valid'] = df['end_date'].notna()
+    valid_both = int((df['start_date_valid'] & df['end_date_valid']).sum())
+    percentage_incorrect_dates = 100 - (valid_both / total_data * 100)
+
+    logger.debug(
+        f"Rango fechas válidas: {valid_both}/{total_data} "
+        f"({100-percentage_incorrect_dates:.2f}% correctas, "
+        f"{percentage_incorrect_dates:.2f}% incorrectas)"
+    )
+
+    # ── FINAL SCORE ─────────────────────────────────────────────────────────────────
+    percentage_temporal = (
+        0.11 * percentage_years
+        + 0.07 * percentage_months
+        + 0.02 * percentage_day
+        - 0.15 * percentage_incorrect_dates
     )
 
     return {
-        "Temporal": percentaje_temporal,
-        "Years": 0.11 * percentaje_years,
-        "Months": 0.07 * percentaje_months,
-        "Days": 0.02 * percentaje_days,
-        "IncorrectDates": 0.15 * percentaje_incorrect_dates,
+        "Temporal": percentage_temporal,
+        "Years": 0.11 * percentage_years,
+        "Months": 0.07 * percentage_months,
+        "Days": 0.02 * percentage_day,
+        "IncorrectDates": 0.15 * percentage_incorrect_dates,
     }

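The core of the reworked temporal check is that eventDate is split on "/" into a start/end range and the year, month and day columns are validated against that parsed range rather than against fixed bounds. The sketch below condenses those steps from the diff onto an invented three-row DataFrame; the sample values and the helper variable names (parts, start, end, years_ok and so on) are illustrative only, while the weights 0.11/0.07/0.02/0.15 come from the code above.

import numpy as np
import pandas as pd

# Invented sample: a date range, a single date, and an unparseable value
df = pd.DataFrame({
    "eventDate": ["2001-03-04/2001-03-10", "1999-07-21", "not a date"],
    "year": [2001, 1999, 2020],
    "month": [3, 7, 1],
    "day": [5, 21, 1],
})
total_data = len(df)

# Split "start/end" ranges; single dates reuse the first part as the end
parts = df["eventDate"].astype(str).str.strip().str.split("/", n=1, expand=True)
if parts.shape[1] == 1:
    parts[1] = parts[0]
parts[1] = np.where(parts[1].eq("") | parts[1].isna(), parts[0], parts[1])
start = pd.to_datetime(parts[0], errors="coerce")
end = pd.to_datetime(parts[1], errors="coerce")

# A component is valid when it lies inside the parsed range (NaT ranges compare False)
years_ok = df["year"].between(start.dt.year, end.dt.year)
months_ok = df["month"].between(start.dt.month, end.dt.month)
days_ok = df["day"].between(start.dt.day, end.dt.day)
incorrect = 100 - (start.notna() & end.notna()).sum() / total_data * 100

temporal = (
    0.11 * (years_ok.sum() / total_data * 100)
    + 0.07 * (months_ok.sum() / total_data * 100)
    + 0.02 * (days_ok.sum() / total_data * 100)
    - 0.15 * incorrect
)
print(round(temporal, 2))  # first two rows validate; the third counts as an incorrect date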