1+ #!/usr/bin/env python3
2+ # -*- coding: utf-8 -*-
3+ """
4+ Created on Thu Mar 6 13:59:07 2025
5+
6+ @author: katherineanne
7+ """
8+ # %% Import modules
9+
10+ import requests
11+ import numpy as np
12+ from netCDF4 import Dataset , num2date
13+ import matplotlib .pyplot as plt
14+ import pandas as pd
15+ import os
16+ from datetime import datetime , date , timedelta
17+ import pyarrow as pa
18+ import pyarrow .parquet as pq
19+ import pyarrow .dataset as ds
20+ import CCMMF_Irrigation_DataDownload
21+ import CCMMF_Irrigation_CalcVis
22+ import CCMMF_Irrigation_Events
23+
24+
# %% Define multi use variables

# Calendar years the water-balance record should cover (2016..2025)
years = [y for y in range(2016, 2026)]

# Root directory for all irrigation management data
main_folder = '/projectnb/dietzelab/ccmmf/management/irrigation/'

# Directory holding the per-location water-balance CSV files
csv_folder = main_folder + 'WaterBalanceCSV/'

# Root path of the hive-partitioned parquet dataset
pq_filename = main_folder + 'CCMMF_Irrigation_Parquet'
38+
# %% Loading data

# Read the previously written parquet dataset. Hive partitioning means the
# partition columns (written as directory names) are recovered as columns.
dataset = ds.dataset(pq_filename, format="parquet", partitioning='hive')
table = dataset.to_table()
parquet_df = table.to_pandas()

# Number of trailing days of CHIRPS data still to download; stays 0 unless
# the stale-file check below finds the saved record is behind today.
days_to_download = 0

# Split the flat frame into one DataFrame per location id
data_dict = {loc: frame for loc, frame in parquet_df.groupby("location")}
50+
# %% Check current date with most current downloaded data

# If the current-year CHIRPS file is stale, delete it so the downloader
# fetches a fresh copy; record how many days are missing.
now = datetime.now()
cur_year = now.year
today = now.date()

# Path of this year's CHIRPS precipitation file.
# (Fixed: the f-string previously contained literal spaces around the
# substitutions, producing a filename that never matched the real file.)
chirps_filename = f'{main_folder}chirps-v2.0.{cur_year}.days_p05.nc'

if os.path.exists(chirps_filename):
    with Dataset(chirps_filename, 'r') as nc:
        # Most recent day present in the file's time axis
        time_var = nc.variables['time']
        dates = num2date(time_var[:], units=time_var.units)
        most_recent = max(dates)
        most_recent_date = date(most_recent.year, most_recent.month, most_recent.day)

    # Delete only after the NetCDF handle is closed (removing an open file
    # fails on some platforms).
    if most_recent_date != today:
        print('Deleted')
        days_to_download = (today - most_recent_date).days
        os.remove(chirps_filename)
72+
# %% Define locations

# Read all design-point coordinates (expects columns 'id', 'lat', 'lon').
# (Fixed: the f-string previously contained a literal space before
# 'design_points.csv', producing a path that does not exist.)
df_lat_lon = pd.read_csv(f'{main_folder}design_points.csv')

# Drop exact duplicate rows so each design point is processed once
df_lat_lon = df_lat_lon.drop_duplicates()
80+
# %% Iterate through locations and download data for each

# Process every design point.
# (Fixed: the loop was hard-coded to range(35), silently skipping any
# design points beyond the first 35 rows.)
for row_number in range(len(df_lat_lon)):

    # Load location data
    latitude = df_lat_lon['lat'].iloc[row_number]
    longitude = df_lat_lon['lon'].iloc[row_number]
    location = df_lat_lon['id'].iloc[row_number]

    # Per-location CSV output path.
    # (Fixed: literal spaces inside the f-string broke the filename.)
    # NOTE(review): 'CCMMR' looks like a typo for 'CCMMF', but it is kept
    # byte-for-byte so files already on disk still match — confirm intent.
    csv_filename = f'{csv_folder}CCMMR_Water_Balance_{latitude}_{longitude}.csv'

    if location in data_dict:

        df = data_dict[location]

        # Top up with the trailing days missing from the saved CHIRPS record
        if days_to_download != 0:
            start_date = today - timedelta(days=days_to_download)
            new_df = CCMMF_Irrigation_DataDownload.new_data_entry_API(
                latitude, longitude,
                [start_date.year, cur_year],
                csv_folder, start_date, today)

            # Concatenate with already saved data, keep chronological order
            old_df = data_dict[location]
            df = pd.concat([new_df, old_df], ignore_index=True)
            df = df.sort_values(by='time')
            data_dict[location] = df

            # Save data
            df.to_csv(csv_filename, index=False)

        # Check that all wanted years have been read in
        df['time'] = pd.to_datetime(df['time'])
        df_years = df['time'].dt.year.unique().tolist()

        # Years we want that are absent from the saved data. Extra years in
        # the saved data are deliberately ignored.
        # (Fixed: the old check 'set(df_years) != set(years)' also fired when
        # the saved data merely had EXTRA years, calling the download API
        # with an empty year list.)
        not_saved_years = list(set(years) - set(df_years))

        if not_saved_years:
            # Download and calculate the missing years
            new_df = CCMMF_Irrigation_DataDownload.new_data_entry_API(
                latitude, longitude, not_saved_years, csv_folder)

            # Concatenate with already saved data, keep chronological order
            old_df = data_dict[location]
            df = pd.concat([new_df, old_df], ignore_index=True)
            df = df.sort_values(by='time')
            data_dict[location] = df

            # Save data
            df.to_csv(csv_filename, index=False)

    # The location is not in the saved dictionary
    else:
        # Download and calculate everything from scratch
        df = CCMMF_Irrigation_DataDownload.new_data_entry_API(
            latitude, longitude, years, csv_folder)
        data_dict[location] = df

        # Save data
        df.to_csv(csv_filename, index=False)
147+
# %% Create Event Files

# Build irrigation event files from the per-location frames
CCMMF_Irrigation_Events.file_creation(data_dict)

# %% Write to parquet

# Persist each location's frame back into the hive-partitioned dataset.
# NOTE(review): partitioning assumes every frame carries a 'year' column
# (present when loaded from the hive dataset) — confirm newly downloaded
# frames have it too.
for loc, frame in data_dict.items():
    # Tag rows with their location so it survives as a partition column
    frame['location'] = loc
    table = pa.Table.from_pandas(frame)
    pq.write_to_dataset(table, root_path=pq_filename,
                        partition_cols=['location', 'year'])