Ecowatt-QC/PrepareDatasetForTraining.py at master · BenoitDelcroix/Ecowatt-QC · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 18 09:32:05 2023

Script to prepare a complete dataset to train a model forecasting the
electricity demand difference between last and next 24 hours (24 outputs)
based on:
    - Weather (36 inputs)
    - Beginning hour of the prediction (1 input)
    - Beginning number of day of the prediction (1 input)
    - Working days (3 inputs) - previous day, current day and next day
    - Total number of inputs: 41

@author: delcr
"""
###############################################################################
# LIBRAIRIES
###############################################################################
import os
import pandas as pd
import numpy as np
from datetime import date
from workalendar.america.canada import Quebec
from tqdm import tqdm

###############################################################################
# FONCTIONS
###############################################################################

###############################################################################
# PROGRAMME PRINCIPAL
###############################################################################
if __name__ == "__main__":

    # Définir le working directory
    cwd = os.getcwd()
    os.chdir(cwd)

    # Load data
    df_use = pd.read_csv('Data/ElectricityDemand/'+\
                         '2023-08-18_AllElectricityDemandData_2019_2022.csv',
                         parse_dates=['Date'])
    df_use.drop_duplicates(subset=['Date'],inplace=True)
    df_use['Date'] = df_use['Date'] - pd.Timedelta(hours=1)
    df_use.set_index('Date',drop=True,inplace=True)
    df_weather = pd.read_csv('Data/Weather/'+\
                             '2023-08-18_AllWeatherData_2019_2022.csv',
                             parse_dates=['Unnamed: 0'])
    df_weather.drop_duplicates(subset=['Unnamed: 0'],inplace=True)
    df_weather.set_index('Unnamed: 0',drop=True,inplace=True)

    # Merge data
    df = df_weather.merge(df_use,how='left',left_index=True,right_index=True)
    df.interpolate('linear',inplace=True)

    # Definitions
    ListColumns = []
    for i in ['last','next']:
        for j in ['min','mean','max']:
            for k in ['Montreal','Quebec','Gatineau','Sherbrooke','Saguenay','Trois-Rivieres']:
                ListColumns.append(i+'24h_'+j+'Temp_'+k)
                pass
            pass
        pass
    ListColumns.extend(['BeginningHour_Forecast','BeginningDay_Forecast',
                        'Workingday_Prev','Workingday_Current',
                        'Workingday_Next'])
    ListOutput = ['delta_'+str(i+1) for i in range(24)]
    ListColumns.extend(ListOutput)

    # Create dataset
    df_vf = pd.DataFrame()
    for i in tqdm(range(24,len(df)-25,1)):
        data=[]
        current = df.iloc[i,:]
        last24 = df.iloc[i-24:i,:]
        next24 = df.iloc[i:i+24,:]
        stat_last24= last24.agg({'Montreal': ['min', 'mean', 'max'],
                                 'Quebec': ['min', 'mean', 'max'],
                                 'Gatineau': ['min', 'mean', 'max'],
                                 'Sherbrooke': ['min', 'mean', 'max'],
                                 'Saguenay': ['min', 'mean', 'max'],
                                 'Trois-Rivieres': ['min', 'mean', 'max']})
        data.extend(stat_last24.values.flatten())
        stat_next24= next24.agg({'Montreal': ['min', 'mean', 'max'],
                                 'Quebec': ['min', 'mean', 'max'],
                                 'Gatineau': ['min', 'mean', 'max'],
                                 'Sherbrooke': ['min', 'mean', 'max'],
                                 'Saguenay': ['min', 'mean', 'max'],
                                 'Trois-Rivieres': ['min', 'mean', 'max']})
        data.extend(stat_next24.values.flatten())
        data.append(next24.index[0].hour)
        data.append(next24.index[0].weekday())
        current_day = next24.index[0]
        previous_day = next24.index[0]-pd.Timedelta(days=1)
        next_day = next24.index[0]+pd.Timedelta(days=1)
        cal = Quebec()
        data.append(int(cal.is_working_day(date(previous_day.year,
                                                previous_day.month,
                                                previous_day.day))))
        data.append(int(cal.is_working_day(date(current_day.year,
                                                current_day.month,
                                                current_day.day))))
        data.append(int(cal.is_working_day(date(next_day.year,
                                                next_day.month,
                                                next_day.day))))
        diff = next24['Moyenne (MW)'].values-last24['Moyenne (MW)'].values
        data.extend(diff.flatten())
        arrData = np.array(data).reshape((1,-1))
        df_temp = pd.DataFrame(data=arrData,columns=ListColumns)
        df_vf = pd.concat([df_vf,df_temp],ignore_index=True)
        pass # end of loop for i in tqdm(range(24,len(df)-25,1))

    # Save training data as csv file
    df_vf.to_csv('ProcessedTrainingDataset.csv')

    pass # end of main program