-
Notifications
You must be signed in to change notification settings - Fork 869
Expand file tree
/
Copy pathpreprocessors.py
More file actions
161 lines (128 loc) · 4.85 KB
/
preprocessors.py
File metadata and controls
161 lines (128 loc) · 4.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import numpy as np
import pandas as pd
from stockstats import StockDataFrame as Sdf
from config import config
def load_dataset(*, file_name: str) -> pd.DataFrame:
    """
    read a csv dataset into memory
    :param file_name: (str) location of the csv file, passed unchanged to pandas.read_csv
    :return: (df) pandas dataframe
    """
    return pd.read_csv(file_name)
def data_split(df,start,end):
    """
    cut a date window out of the dataset (used for the training / testing split)
    :param df: (df) pandas dataframe with 'datadate' and 'tic' columns
    :param start: first date of the window (inclusive)
    :param end: end date of the window (exclusive)
    :return: (df) pandas dataframe sorted by date then ticker; all rows of the
        same date share one integer index value (0 for the first date, 1 for the next, ...)
    """
    dates = df.datadate
    in_window = (dates >= start) & (dates < end)
    subset = df[in_window].sort_values(['datadate', 'tic'], ignore_index=True)
    # factorize numbers the dates in order of appearance, i.e. ascending after the sort
    day_codes, _ = subset.datadate.factorize()
    subset.index = day_codes
    return subset
def calcualte_price(df):
    """
    calculate adjusted close price, open-high-low price and volume
    :param df: (df) pandas dataframe with the columns 'datadate', 'tic', 'prccd',
        'ajexdi', 'prcod', 'prchd', 'prcld' and 'cshtrd'; it is not modified
    :return: (df) pandas dataframe with the columns 'datadate', 'tic', 'adjcp',
        'open', 'high', 'low', 'volume', sorted by ticker then date with a fresh 0..n-1 index
    """
    source_columns = ['datadate', 'tic', 'prccd', 'ajexdi', 'prcod', 'prchd', 'prcld', 'cshtrd']
    # Take an explicit copy of just the needed columns: new columns are assigned
    # below, and assigning onto a plain column slice is a chained assignment.
    data = df[source_columns].copy()

    # An adjustment factor of 0 is treated as 1 so the divisions below are well defined.
    data['ajexdi'] = data['ajexdi'].mask(data['ajexdi'] == 0, 1)

    # Each raw price column is divided by the adjustment factor.
    adjusted_from_raw = {'adjcp': 'prccd', 'open': 'prcod', 'high': 'prchd', 'low': 'prcld'}
    for adjusted_name, raw_name in adjusted_from_raw.items():
        data[adjusted_name] = data[raw_name] / data['ajexdi']
    data['volume'] = data['cshtrd']

    data = data[['datadate', 'tic', 'adjcp', 'open', 'high', 'low', 'volume']]
    return data.sort_values(['tic', 'datadate'], ignore_index=True)
def add_technical_indicator(df):
    """
    calculate technical indicators
    use the stockstats package to add the indicators 'macd', 'rsi_30', 'cci_30'
    and 'dx_30', computed separately for every ticker
    :param df: (df) pandas dataframe with 'tic' and 'adjcp' columns; the columns
        'macd', 'rsi', 'cci' and 'adx' are added to it in place
    :return: (df) the same pandas dataframe with the four indicator columns added

    NOTE(review): the per-ticker results are concatenated in order of ticker
    appearance and re-indexed 0..n-1 before being assigned, so this presumably
    expects df to have a default 0..n-1 index with each ticker's rows contiguous
    (as a frame sorted by ticker then date would be) - confirm against callers.
    """
    stock = Sdf.retype(df.copy())
    # stockstats reads the 'close' column; feed it the adjusted close price
    stock['close'] = stock['adjcp']

    # output column name -> stockstats indicator name
    indicator_names = {'macd': 'macd', 'rsi': 'rsi_30', 'cci': 'cci_30', 'adx': 'dx_30'}
    collected = {column: [] for column in indicator_names}

    for ticker in stock.tic.unique():
        for column, indicator in indicator_names.items():
            # take a fresh per-ticker selection for every indicator, as the
            # indicator is computed on access of the column
            ticker_rows = stock[stock.tic == ticker]
            collected[column].append(pd.DataFrame(ticker_rows[indicator]))

    for column, frames in collected.items():
        # DataFrame.append no longer exists in pandas 2.0+; collect then concat once
        df[column] = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    return df
def preprocess_data():
    """
    data preprocessing pipeline
    loads config.TRAINING_DATA_FILE, keeps the rows with datadate >= 20090000,
    derives adjusted prices, adds the technical indicators and back-fills
    missing values
    :return: (df) pandas dataframe with prices and technical indicators
    """
    df = load_dataset(file_name=config.TRAINING_DATA_FILE)
    # get data after 2009 (datadate presumably is an integer of the form YYYYMMDD - confirm)
    df = df[df.datadate >= 20090000]
    # calculate adjusted price
    df_preprocess = calcualte_price(df)
    # add technical indicators using stockstats
    df_final = add_technical_indicator(df_preprocess)
    # fill the missing values at the beginning with the next valid observation;
    # DataFrame.bfill is the non-deprecated spelling of fillna(method='bfill')
    df_final.bfill(inplace=True)
    return df_final
def add_turbulence(df):
    """
    attach the turbulence index to every row of the dataset
    :param df: (df) pandas dataframe with 'datadate' and 'tic' columns (plus
        whatever calcualte_turbulence reads)
    :return: (df) pandas dataframe with the merged turbulence data, sorted by
        date then ticker with a fresh 0..n-1 index
    """
    merged = df.merge(calcualte_turbulence(df), on='datadate')
    ordered = merged.sort_values(['datadate', 'tic'])
    return ordered.reset_index(drop=True)
def calcualte_turbulence(df, start=252):
    """
    calculate turbulence index based on dow 30

    For every date after the first ``start`` dates the index is the quadratic form
    ``(p - mean) * pinv(cov) * (p - mean)^T``, where ``p`` is that date's vector of
    adjusted close prices and ``mean`` / ``cov`` are taken over all earlier dates.
    The first ``start`` dates, non-positive results and the first two positive
    results are reported as 0.

    :param df: (df) pandas dataframe with 'datadate', 'tic' and 'adjcp' columns
        and at most one row per (datadate, tic) pair; row order does not matter
    :param start: (int) number of leading dates used only as history
        (default 252, "start after a year"); must be at least 2
    :return: (df) pandas dataframe with the columns 'datadate' and 'turbulence',
        one row per unique date in ascending date order
    :raises ValueError: if start is smaller than 2
    """
    if start < 2:
        raise ValueError("start must be at least 2 to estimate a covariance matrix")

    # can add other market assets
    df_price_pivot = df.pivot(index='datadate', columns='tic', values='adjcp')
    n_dates = len(df_price_pivot.index)

    # Never pad with more zeros than there are dates, otherwise the result
    # columns below would have different lengths.
    start = min(start, n_dates)
    turbulence_index = [0] * start

    count = 0
    # Walk the pivot's own (sorted) date index so every value lines up with the
    # date it is reported for, whatever the row order of df was.
    for i in range(start, n_dates):
        current_price = df_price_pivot.iloc[[i]]
        hist_price = df_price_pivot.iloc[:i]
        cov_temp = hist_price.cov()
        current_temp = current_price - hist_price.mean(axis=0)
        temp = current_temp.values.dot(np.linalg.pinv(cov_temp)).dot(current_temp.values.T)
        if temp > 0:
            count += 1
            # avoid large outlier because the calculation just begins
            turbulence_temp = temp[0][0] if count > 2 else 0
        else:
            turbulence_temp = 0
        turbulence_index.append(turbulence_temp)

    return pd.DataFrame({'datadate': df_price_pivot.index,
                         'turbulence': turbulence_index})