@@ -16,11 +16,15 @@
 """

 import os
-from datetime import timedelta
+from datetime import date, timedelta
+from itertools import chain

 import pendulum
 from airflow import DAG
 from airflow.decorators import task
+from airflow.models import Variable
+from satellite import ADM2, request
+from sqlalchemy import create_engine, text

 env = os.getenv
 email_main = env("EMAIL_MAIN")
@@ -40,98 +44,46 @@
     dag_id="COPERNICUS_BRASIL",
     description="ETL of weather data for Brazil",
     tags=["Brasil", "Copernicus"],
-    schedule="@daily",
+    schedule="@monthly",
     default_args=DEFAULT_ARGS,
-    start_date=pendulum.datetime(2024, 1, 1),
+    start_date=pendulum.datetime(2000, 1, 1),
+    end_date=pendulum.datetime(2024, 1, 1),
     catchup=True,
     max_active_runs=14,
-):
-    from airflow.models import Variable
-
+) as dag:
     DATE = "{{ ds }}"  # DAG execution date
-    DATA_DIR = "/tmp/copernicus"
     KEY = Variable.get("cdsapi_key", deserialize_json=True)
     URI = Variable.get("psql_main_uri", deserialize_json=True)

-    # fmt: off
-    @task.external_python(
-        task_id="daily_fetch",
-        python="/opt/py310/bin/python3.10"
-    )
-    # fmt: on
-    def extract_transform_load(
-        date: str, data_dir: str, api_key: str, psql_uri: str
-    ) -> str:
-        """
-        Due to incompatibility issues between Airflow's Python version
-        and the satellite-weather-downloader (SWD) package, this task
-        will be executed in a dedicated virtual environment, which
-        includes a pre-installed Python3.10 interpreter within the
-        container. All imports must be within the scope of the task,
-        and XCom sharing between tasks is not allowed.
-
-        The task is designed to receive the execution date and download
-        the weather dataset for that specific day. After downloading,
-        the data is transformed using Xarray and inserted into the Main
-        Postgres DB, as specified in the .env file, in the form of a
-        DataFrame containing the weather information.
-        """
-        from datetime import timedelta
-        from itertools import chain
-        from pathlib import Path
-
-        from dateutil import parser
-        from satellite import downloader as sat_d
-        from satellite import weather as sat_w
-        from satellite.weather.brazil.extract_latlons import MUNICIPALITIES
-        from sqlalchemy import create_engine, text
-
-        start_date = parser.parse(str(date))
-        max_update_delay = start_date - timedelta(days=6)
+    @task
+    def fetch_ds(locale, dt, uri, api_key):
+        tablename = f"copernicus_{locale.lower()}"
+        engine = create_engine(uri)
+        dt = date.fromisoformat(dt) - timedelta(days=5)  # dataset is published with a delay

-        with create_engine(psql_uri["PSQL_MAIN_URI"]).connect() as conn:
+        with engine.connect() as conn:
             cur = conn.execute(
                 text(
-                    "SELECT geocodigo FROM weather.copernicus_brasil"
-                    f" WHERE date = '{str(max_update_delay.date())}'"
+                    f"SELECT geocode FROM weather.{tablename}"
+                    f" WHERE date = '{str(dt)}'"
                 )
             )
             table_geocodes = set(chain(*cur.fetchall()))

-        all_geocodes = set([mun["geocodigo"] for mun in MUNICIPALITIES])
+        all_geocodes = set([adm.code for adm in ADM2.filter(adm0=locale)])
         geocodes = all_geocodes.difference(table_geocodes)
         print("TABLE_GEO ", f"[{len(table_geocodes)}]: ", table_geocodes)
         print("DIFF_GEO: ", f"[{len(geocodes)}]: ", geocodes)

-        if not geocodes:
-            return "There is no geocode to fetch"
-
-        # Downloads daily dataset
-        netcdf_file = sat_d.download_br_netcdf(
-            date=str(max_update_delay.date()),
-            data_dir=data_dir,
-            user_key=api_key["CDSAPI_KEY"],
-        )
-
-        print(f"Handling {netcdf_file}")
-
-        # Reads the NetCDF4 file using Xarray
-        ds = sat_w.load_dataset(netcdf_file)
-
-        with create_engine(psql_uri["PSQL_MAIN_URI"]).connect() as conn:
-            ds.copebr.to_sql(
-                tablename="copernicus_brasil",
-                schema="weather",
-                geocodes=list(geocodes),
-                con=conn,
-            )
-
-        # Deletes the NetCDF4 file
-        Path(netcdf_file).unlink(missing_ok=True)
-
-        return f"{len(geocodes)} inserted into DB."
-
-    # Instantiate the Task
-    ETL = extract_transform_load(DATE, DATA_DIR, KEY, URI)
-
-    ETL  # Execute
+        with request.reanalysis_era5_land(
+            str(dt).replace("-", "_") + locale,
+            api_token=api_key,
+            date=str(dt),
+            locale=locale,
+        ) as ds:
+            # Only insert the geocodes still missing from the table
+            for adm in (a for a in ADM2.filter(adm0=locale) if a.code in geocodes):
+                with engine.connect() as conn:
+                    ds.cope.to_sql(adm, conn, tablename, "weather")
+
+    fetch_ds("BRA", DATE, URI["PSQL_MAIN_URI"], KEY["CDSAPI_KEY"])
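Since `tablename` and `dt` are built from controlled values, the f-string query above is safe in practice, but the date could also be passed as a bound parameter. A minimal sketch using SQLAlchemy's named-parameter binding, reusing the `tablename`, `conn`, and `dt` names from `fetch_ds` above:

```python
# Sketch: the same geocode lookup with the date bound as a parameter
# instead of interpolated into the SQL string.
cur = conn.execute(
    text(f"SELECT geocode FROM weather.{tablename} WHERE date = :dt"),
    {"dt": str(dt)},
)
table_geocodes = set(chain(*cur.fetchall()))
```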
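Because `fetch_ds` now receives the locale as an argument, the same task could later be fanned out over several countries with Airflow's dynamic task mapping (Airflow 2.3+). A minimal sketch, placed inside the same `with DAG(...) as dag:` block; the `"ARG"` entry is purely illustrative:

```python
# Hypothetical fan-out over several ADM0 locales ("ARG" is illustrative).
# Each mapped task instance fetches one country's dataset and writes to
# its own weather.copernicus_<locale> table.
fetch_ds.partial(
    dt=DATE,
    uri=URI["PSQL_MAIN_URI"],
    api_key=KEY["CDSAPI_KEY"],
).expand(locale=["BRA", "ARG"])
```

Each mapped instance writes to a separate table because `tablename` is derived from `locale` inside the task.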