Skip to content

Commit e04e863

Browse files
authored
Merge pull request #146 from marceloamaral/abs-scaler
Update the feature normalization to use maxAbsScaler
2 parents 1be7bf9 + 64e5598 commit e04e863

File tree

8 files changed

+35
-56
lines changed

8 files changed

+35
-56
lines changed

cmd/main.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -652,7 +652,7 @@ def plot(args):
652652
_ts_plot(power_data, power_cols, "Power source: {}".format(energy_source), output_folder, data_filename, ylabel="Power (W)")
653653
elif args.target_data == "estimate":
654654
from estimate import default_predicted_col_func
655-
from sklearn.preprocessing import MinMaxScaler
655+
from sklearn.preprocessing import MaxAbsScaler
656656

657657
best_result_map, power_labels_map, best_model_id_map, _ = estimate(args)
658658
for energy_source, best_restult in best_result_map.items():
@@ -680,13 +680,13 @@ def plot(args):
680680
# plot correlation to utilization if feature group is set
681681
if fg is not None:
682682
feature_cols = FeatureGroups[fg]
683-
scaler = MinMaxScaler()
683+
scaler = MaxAbsScaler()
684684
data[feature_cols] = best_restult[[TIMESTAMP_COL] + feature_cols].groupby([TIMESTAMP_COL]).sum().sort_index()
685685
data[feature_cols] = scaler.fit_transform(data[feature_cols])
686686
_feature_power_plot(data, model_id, ot.name, energy_source, feature_cols, actual_power_cols, predicted_power_cols, output_folder, "{}_{}_corr".format(data_filename, model_id))
687687
elif args.target_data == "error":
688688
from estimate import default_predicted_col_func
689-
from sklearn.preprocessing import MinMaxScaler
689+
from sklearn.preprocessing import MaxAbsScaler
690690
_, _, _, summary_df = estimate(args)
691691
for energy_source in energy_sources:
692692
data_filename = get_general_filename(args.target_data, energy_source, fg, ot, args.extractor, args.isolator)

src/train/profiler/generate_scaler.py

Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
sys.path.append(util_path)
2626
sys.path.append(tool_path)
2727

28-
from sklearn.preprocessing import MinMaxScaler, StandardScaler
28+
from sklearn.preprocessing import MaxAbsScaler
2929

3030
from train import DefaultExtractor, node_info_column, FeatureGroups, FeatureGroup, TIMESTAMP_COL
3131
from util.train_types import SYSTEM_FEATURES
@@ -36,13 +36,10 @@
3636

3737
extractor = DefaultExtractor()
3838

39-
minmax_scaler_top_path = os.path.join(profile_path, '..', 'minmax_scaler')
40-
standard_scaler_top_path = os.path.join(profile_path, '..', 'standard_scaler')
39+
max_scaler_top_path = os.path.join(profile_path, '..', 'max_scaler')
4140

42-
if not os.path.exists(minmax_scaler_top_path):
43-
os.mkdir(minmax_scaler_top_path)
44-
if not os.path.exists(standard_scaler_top_path):
45-
os.mkdir(standard_scaler_top_path)
41+
if not os.path.exists(max_scaler_top_path):
42+
os.mkdir(max_scaler_top_path)
4643

4744
def read_query_results(query_path):
4845
results = dict()
@@ -88,9 +85,6 @@ def process(query_results):
8885
node_types = pd.unique(feature_data[node_info_column])
8986
# filter and extract features
9087
x_values = feature_data[feature_data[node_info_column]==node_type][features].values
91-
standard_scaler = StandardScaler()
92-
minmax_scaler = MinMaxScaler()
93-
standard_scaler.fit(x_values)
94-
minmax_scaler.fit(x_values)
95-
save_scaler(standard_scaler, node_type, feature_group_name, standard_scaler_top_path)
96-
save_scaler(minmax_scaler, node_type, feature_group_name, minmax_scaler_top_path)
88+
max_scaler = MaxAbsScaler()
89+
max_scaler.fit(x_values)
90+
save_scaler(max_scaler, node_type, feature_group_name, max_scaler_top_path)

src/train/profiler/profiler.py

Lines changed: 10 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -137,28 +137,23 @@ def __init__(self, node_type):
137137
self.profile = dict()
138138
for source in PowerSourceMap.keys():
139139
self.profile[source] = dict()
140-
self.standard_scaler = dict()
141-
self.minmax_scaler = dict()
140+
self.max_scaler = dict()
142141
for feature_group in FeatureGroups.keys():
143142
#######################
144143
# uncomment to append from remote profile
145144
#
146145
# feature_key = feature_group.name
147-
# standard_scaler = Profile.load_scaler(self.node_type, feature_key, scaler_type="standard")
148-
# minmax_scaler = Profile.load_scaler(self.node_type, feature_key, scaler_type="minmax")
146+
# max_scaler = Profile.load_scaler(self.node_type, feature_key, scaler_type="maxabs")
149147
#######################
150-
standard_scaler = None
151-
minmax_scaler = None
152-
if standard_scaler is not None:
153-
self.standard_scaler[feature_group.name] = standard_scaler
154-
if minmax_scaler is not None:
155-
self.minmax_scaler[feature_group.name] = minmax_scaler
148+
max_scaler = None
149+
if max_scaler is not None:
150+
self.max_scaler[feature_group.name] = max_scaler
156151

157152
def add_profile(self, source, component, profile_value):
158153
self.profile[source][component] = profile_value
159154

160155
@staticmethod
161-
def load_scaler(node_type, feature_key, scaler_type): # scaler_type = minmax or standard
156+
def load_scaler(node_type, feature_key, scaler_type): # scaler_type = maxabs
162157
try:
163158
url_path = os.path.join(profiler_registry, scaler_type + "_scaler", str(node_type), feature_key + ".pkl")
164159
response = urlopen(url_path)
@@ -168,15 +163,10 @@ def load_scaler(node_type, feature_key, scaler_type): # scaler_type = minmax or
168163
print(url_path, e)
169164
return None
170165

171-
def get_minmax_scaler(self, feature_key):
172-
if feature_key not in self.minmax_scaler:
166+
def get_max_scaler(self, feature_key):
167+
if feature_key not in self.max_scaler:
173168
return None
174-
return self.minmax_scaler[feature_key]
175-
176-
def get_standard_scaler(self, feature_key):
177-
if feature_key not in self.standard_scaler:
178-
return None
179-
return self.standard_scaler[feature_key]
169+
return self.max_scaler[feature_key]
180170

181171
def get_background_power(self, source, component):
182172
if source not in self.profile:
@@ -191,7 +181,7 @@ def get_min_power(self, source, component):
191181
return self.profile[source][component][min_watt_key]
192182

193183
def print_profile(self):
194-
print("Profile (node type={}): \n Available energy components: {}\n Available minmax scalers: {}\n Available standard scalers: {}".format(self.node_type, ["{}/{}".format(key, list(self.profile[key].keys())) for key in self.profile.keys()], self.minmax_scaler.keys(), self.standard_scaler.keys()))
184+
print("Profile (node type={}): \n Available energy components: {}\n Available maxabs scalers: {}".format(self.node_type, ["{}/{}".format(key, list(self.profile[key].keys())) for key in self.profile.keys()], self.max_scaler.keys()))
195185

196186
def generate_profiles(profile_map):
197187
profiles = dict()

src/train/trainer/SGDRegressorTrainer/main.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from sklearn.linear_model import SGDRegressor
2-
from sklearn.preprocessing import StandardScaler
2+
from sklearn.preprocessing import MaxAbsScaler
33
import joblib
44
from urllib.request import urlopen
55

@@ -12,7 +12,7 @@
1212

1313
class SGDRegressorTrainer(ScikitTrainer):
1414
def __init__(self, energy_components, feature_group, energy_source, node_level, pipeline_name):
15-
super(SGDRegressorTrainer, self).__init__(energy_components, feature_group, energy_source, node_level, scaler_type="standard", pipeline_name=pipeline_name)
15+
super(SGDRegressorTrainer, self).__init__(energy_components, feature_group, energy_source, node_level, pipeline_name=pipeline_name)
1616
self.fe_files = []
1717

1818
def init_model(self):

src/train/trainer/SVRRegressorTrainer/main.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from sklearn.preprocessing import StandardScaler
1+
from sklearn.preprocessing import MaxAbsScaler
22
from sklearn.svm import SVR
33
from sklearn.pipeline import make_pipeline
44

@@ -12,7 +12,7 @@
1212

1313
class SVRRegressorTrainer(ScikitTrainer):
1414
def __init__(self, energy_components, feature_group, energy_source, node_level, pipeline_name):
15-
super(SVRRegressorTrainer, self).__init__(energy_components, feature_group, energy_source, node_level, scaler_type="standard", pipeline_name=pipeline_name)
15+
super(SVRRegressorTrainer, self).__init__(energy_components, feature_group, energy_source, node_level, pipeline_name=pipeline_name)
1616
self.fe_files = []
1717

1818
def init_model(self):

src/train/trainer/__init__.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def get_assured_checkpoint_path(group_path, assure=True):
2323

2424
import pandas as pd
2525
from sklearn.model_selection import train_test_split
26-
from sklearn.preprocessing import MinMaxScaler, StandardScaler
26+
from sklearn.preprocessing import MaxAbsScaler
2727

2828
def normalize_and_split(X_values, y_values, scaler, test_size=0.1):
2929
features = scaler.transform(X_values)
@@ -32,7 +32,7 @@ def normalize_and_split(X_values, y_values, scaler, test_size=0.1):
3232

3333

3434
class Trainer(metaclass=ABCMeta):
35-
def __init__(self, model_class, energy_components, feature_group, energy_source, node_level, pipeline_name, scaler_type="minmax"):
35+
def __init__(self, model_class, energy_components, feature_group, energy_source, node_level, pipeline_name, scaler_type="maxabs"):
3636
self.energy_components = energy_components
3737
self.feature_group_name = feature_group
3838
self.feature_group = FeatureGroup[feature_group]
@@ -133,10 +133,7 @@ def process(self, data, power_labels, pipeline_lock):
133133
self.print_log("fit scaler to latest data".format(node_type, self.feature_group_name))
134134
# no profiled scaler
135135
x_values = node_type_filtered_data[self.features].values
136-
if self.scaler_type == "standard":
137-
self.node_scalers[node_type] = StandardScaler()
138-
else:
139-
self.node_scalers[node_type] = MinMaxScaler()
136+
self.node_scalers[node_type] = MaxAbsScaler()
140137
self.node_scalers[node_type].fit(x_values)
141138

142139
X_test_map = dict()

src/train/trainer/scikit.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,7 @@ def get_save_path(model_filepath):
1616
return "/".join(model_filepath.split("/")[0:-1])
1717

1818
class ScikitTrainer(Trainer):
19-
def __init__(self, energy_components, feature_group, energy_source, node_level, pipeline_name, scaler_type="minmax"):
20-
self.is_standard_scaler = scaler_type == "standard"
19+
def __init__(self, energy_components, feature_group, energy_source, node_level, pipeline_name, scaler_type="maxabs"):
2120
super(ScikitTrainer, self).__init__(model_class, energy_components, feature_group, energy_source, node_level, pipeline_name, scaler_type=scaler_type)
2221
self.fe_files = []
2322

@@ -64,24 +63,23 @@ def save_model(self, component_save_path, node_type, component):
6463
def component_model_filename(self, component):
6564
return component + ".pkl"
6665

67-
def get_weight_dict(self, node_type):
68-
if not self.is_standard_scaler:
69-
# cannot get weight dict
70-
return None
66+
def get_weight_dict(self, node_type):
7167
weight_dict = dict()
7268

7369
for component, model in self.node_models[node_type].items():
7470
scaler = self.node_scalers[node_type]
7571
if not hasattr(model, "intercept_") or not hasattr(model, "coef_") or len(model.coef_) != len(self.features) or len(model.intercept_) != 1:
7672
return None
7773
else:
74+
# TODO: remove the mean and variance variables after updating the Kepler code
7875
weight_dict[component] = {
7976
"All_Weights": {
8077
"Bias_Weight": model.intercept_[0],
8178
"Categorical_Variables": dict(),
8279
"Numerical_Variables": {self.features[i]:
83-
{"mean": scaler.mean_[i],
84-
"variance": scaler.var_[i],
80+
{"scale": scaler.scale_[i],
81+
"mean": 0,
82+
"variance": 0,
8583
"weight": model.coef_[i],
8684
}
8785
for i in range(len(self.features))},

tests/common_plot.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
from util.prom_types import TIMESTAMP_COL
2020
from util.extract_types import col_to_component
2121

22-
from sklearn.preprocessing import MinMaxScaler
22+
from sklearn.preprocessing import MaxAbsScaler
2323

2424
from train.extractor.preprocess import get_extracted_power_labels
2525
from estimate import get_label_power_colname
@@ -31,7 +31,7 @@ def _fig_filename(figname, save_path=plot_output_path):
3131
return os.path.join(save_path, figname + ".png")
3232

3333
def preprocess_data(df):
34-
scaler = MinMaxScaler()
34+
scaler = MaxAbsScaler()
3535
df = df.reset_index()
3636
normalized_data = scaler.fit_transform(df.values)
3737
normalized_df = pd.DataFrame(normalized_data, columns=df.columns, index=df.index)

0 commit comments

Comments (0)